ggml : do not sched_yield when calling BLAS (llama/4761)
author    Georgi Gerganov <redacted>
          Fri, 5 Jan 2024 13:18:21 +0000 (15:18 +0200)
committer Georgi Gerganov <redacted>
          Thu, 11 Jan 2024 19:50:00 +0000 (21:50 +0200)
* ggml : do not sched_yield when calling BLAS

ggml-ci

* ggml : fix do_yield logic

ggml-ci

* ggml : simplify do_yield logic

ggml-ci

ggml.c

diff --git a/ggml.c b/ggml.c
index b124f14cc15eee4feb1bb82700e5591729fd9023..62f0f18ef3b700dcb3905dbb46e4cc418f574595 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
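The refactor above trades three parameters for one: every ggml tensor already records its operands in dst->src[], so the helper can recover src0 and src1 itself. A minimal sketch of the before/after pattern — struct tensor and the size test are illustrative stand-ins, not the real ggml types or heuristic:

    #include <stdbool.h>
    #include <stdint.h>

    struct tensor {
        int64_t         ne[4];   /* dimensions */
        struct tensor * src[2];  /* operands of the op that produced this tensor */
    };

    /* before: the caller had to pass src0, src1 and dst separately */
    static bool use_blas_v1(const struct tensor * src0,
                            const struct tensor * src1,
                                  struct tensor * dst) {
        /* illustrative size check only - ggml's real test is more involved */
        return src0->ne[1] >= 32 && src1->ne[1] >= 32 && dst->ne[0] >= 32;
    }

    /* after: dst alone is enough - the operands are looked up in dst->src */
    static bool use_blas_v2(struct tensor * dst) {
        const struct tensor * src0 = dst->src[0];
        const struct tensor * src1 = dst->src[1];
        return src0->ne[1] >= 32 && src1->ne[1] >= 32 && dst->ne[0] >= 32;
    }

Besides shortening every call site, this keeps the signature stable if the heuristic later needs more of dst's context.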
@@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
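For context, the atomic_fetch_sub above implements a countdown: the thread that observes the old value 1 is the last one active and is elected to run the single-threaded finalize/init work while the others wait. A self-contained sketch of that idiom, assuming C11 atomics and <threads.h> (not available on every libc):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <threads.h>

    #define N_THREADS 4

    static atomic_int n_active = N_THREADS;

    static int worker(void * arg) {
        (void) arg;
        /* ... per-thread compute work would happen here ... */

        /* fetch_sub returns the previous value, so exactly one thread sees 1 */
        if (atomic_fetch_sub(&n_active, 1) == 1) {
            printf("last thread: running finalize\n");
        }
        return 0;
    }

    int main(void) {
        thrd_t t[N_THREADS];
        for (int i = 0; i < N_THREADS; i++) thrd_create(&t[i], worker, NULL);
        for (int i = 0; i < N_THREADS; i++) thrd_join(t[i], NULL);
        return 0;
    }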
@@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
             while (true) {
                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                 //       depending on the workload and the operating system.
                 //       since it is not clear what is the best approach, it should potentially become user-configurable
                 //       ref: https://github.com/ggerganov/ggml/issues/291
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                sched_yield();
-#endif
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                 node_n = atomic_load(&state->shared->node_n);
                 if (node_n != last) break;
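This hunk is the heart of the commit. Previously, every spin iteration called sched_yield() whenever ggml was built with Accelerate/OpenBLAS, even while waiting on cheap ops; now the wait yields only when the node being waited on is unknown (last < 0) or is a GGML_OP_MUL_MAT, i.e. a potentially long BLAS call where giving up the time slice to BLAS's own threads pays off. A compilable sketch of the resulting wait loop, with hypothetical names standing in for ggml's shared state:

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int shared_node_n;   /* stand-in for state->shared->node_n */

    enum op { OP_ADD, OP_MUL_MAT };

    static int wait_for_next_node(int last, enum op last_op) {
        /* yield only while a (potentially long) matrix multiplication runs,
           or when there is no previous node to inspect */
        const bool do_yield = last < 0 || last_op == OP_MUL_MAT;

        int node_n;
        while (true) {
            if (do_yield) {
                sched_yield();         /* give the core to BLAS's threads */
            }
            node_n = atomic_load(&shared_node_n);
            if (node_n != last) {
                break;                 /* the graph moved on to a new node */
            }
        }
        return node_n;
    }

The trade-off described in the original TODO still applies — yielding helps when the core is oversubscribed and hurts when a busy wait would resume faster — but gating it on the op type is the compromise that, per the comment above, seems to resolve the issue universally.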
@@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
                             cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
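The planner hunk sizes the scratch buffer for the BLAS path: when src0 is quantized rather than F32, it must first be dequantized, so the plan reserves room for one 2D slice of src0 in float precision. A back-of-the-envelope sketch of that computation (illustrative names, not the ggml API):

    #include <stdint.h>
    #include <stdio.h>

    /* scratch bytes needed to hold one ne0 x ne1 matrix of src0 as F32 */
    static size_t blas_scratch_size(int64_t ne0, int64_t ne1) {
        return sizeof(float) * (size_t)(ne0 * ne1);
    }

    int main(void) {
        /* e.g. a 4096 x 4096 quantized weight needs a 64 MiB F32 buffer */
        printf("%zu bytes\n", blas_scratch_size(4096, 4096));
        return 0;
    }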