ggml : parallelize FP32 conversion when using BLAS (llama/5045)
Author:     Reinforce-II <redacted>
AuthorDate: Mon, 22 Jan 2024 13:15:08 +0000 (21:15 +0800)
Commit:     Georgi Gerganov <redacted>
CommitDate: Sat, 27 Jan 2024 15:19:51 +0000 (17:19 +0200)
* allow the GGML_TASK_INIT phase to run multithreaded

* multithreaded dequantization in mul_mat when using a BLAS library (sketched below)

* minor fixes

* update outdated comment

* fix coding style

* simplify code

Co-authored-by: Georgi Gerganov <redacted>
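
In essence, the INIT-phase change replaces the old single-threaded dequantization loop with a row-striped one. A minimal standalone C sketch of the pattern (the helper dequantize_rows_striped and the to_float_t typedef are illustrative names, not ggml API): thread ith of nth converts rows ith, ith+nth, ... of one src0 plane into the F32 scratch buffer, so no two threads ever write the same row and no locking is needed.

    /* sketch only: illustrative names, not ggml API */
    #include <stdint.h>
    #include <stddef.h>

    typedef void (*to_float_t)(const void * src, float * dst, int64_t n);

    static void dequantize_rows_striped(
            const char * x,         // one 2D plane of quantized src0
            size_t       row_bytes, // byte stride of one quantized row (nb01)
            float      * wdata,     // F32 destination plane in params->wdata
            int64_t      ne00,      // elements per row
            int64_t      ne01,      // rows in the plane
            to_float_t   to_float,  // per-type converter from type_traits
            int ith, int nth) {
        // each thread owns rows ith, ith+nth, ith+2*nth, ...
        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
            to_float(x + i01*row_bytes, wdata + i01*ne00, ne00);
        }
    }

Because the strides partition the rows exactly, the only synchronization needed is a barrier between the INIT and COMPUTE phases, which the scheduler changes in the diff below provide.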
ggml.c

diff --git a/ggml.c b/ggml.c
index cf245d5f93ce76c62bebd023c60347222d8e059c..2a52c5ff0a60c20a549788c65d78e336ae2c5ab0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7815,6 +7815,9 @@ static void ggml_compute_forward_acc_f32(
     bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -9957,11 +9960,30 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
+                              float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                              ggml_to_float_t  const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
 
@@ -9969,9 +9991,14 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9980,17 +10007,7 @@ static void ggml_compute_forward_mul_mat(
                       float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                            float * const wdata    = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -10000,6 +10017,7 @@ static void ggml_compute_forward_mul_mat(
                          0.0f,    d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
@@ -10008,6 +10026,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10172,6 +10193,9 @@ static void ggml_compute_forward_mul_mat_id(
     #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10357,6 +10381,9 @@ static void ggml_compute_forward_out_prod_f32(
             return;
         }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10540,6 +10567,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10724,6 +10754,9 @@ static void ggml_compute_forward_set_f32(
     bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11048,6 +11081,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11082,6 +11118,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11219,6 +11258,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12189,6 +12231,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12283,6 +12328,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12507,6 +12555,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14121,6 +14172,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16414,8 +16468,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16663,6 +16718,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
@@ -16673,7 +16756,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16713,13 +16797,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
                 params.nth = n_tasks;
 
-                /* INIT */
-                if (GGML_OP_HAS_INIT[node->op]) {
-                    params.type = GGML_TASK_INIT;
-                    ggml_compute_forward(&params, node);
-                }
-
                 if (n_tasks == 1) {
+                    /* INIT */
+                    if (GGML_OP_HAS_INIT[node->op]) {
+                        params.type = GGML_TASK_INIT;
+                        ggml_compute_forward(&params, node);
+                    }
+
                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                     // they do something more efficient than spinning (?)
                     params.type = GGML_TASK_COMPUTE;
@@ -16740,38 +16824,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
@@ -16779,8 +16849,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         };
 
         if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
@@ -16837,8 +16938,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                     if (ggml_compute_forward_mul_mat_use_blas(node)) {
                         if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            // here we need memory for fully dequantized matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                         }
                     } else
 #endif
@@ -16992,6 +17093,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads               =*/ n_threads,
         /*.n_active                =*/ n_threads,
         /*.node_n                  =*/ -1,
+        /*.node_task               =*/ GGML_TASK_FINALIZE,
         /*.abort_callback          =*/ NULL,
         /*.abort_callback_data     =*/ NULL,
     };
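
For reference, the hand-off that the new node_task field enables reduces to a small C11 sketch (simplified names, assuming C11 atomics and POSIX sched_yield; this is not the actual ggml_compute_state_shared machinery): every thread calls the barrier after finishing a phase, the last one to arrive rearms n_active and publishes the next phase, and the rest spin until the phase they last observed changes.

    /* sketch only: simplified names, not the real ggml scheduler */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>

    struct shared_state {
        int        n_threads;
        atomic_int n_active;  // threads still inside the current phase
        atomic_int node_task; // currently active phase
    };

    static void phase_barrier(struct shared_state * sh, int cur_phase, int next_phase, bool do_yield) {
        if (atomic_fetch_sub(&sh->n_active, 1) == 1) {
            // last thread out: rearm the counter, then publish the new phase
            atomic_store(&sh->n_active,  sh->n_threads);
            atomic_store(&sh->node_task, next_phase);
        } else {
            // spin until the publisher moves node_task past cur_phase;
            // yielding here mirrors the do_yield flag used after MUL_MAT nodes
            while (atomic_load(&sh->node_task) == cur_phase) {
                if (do_yield) {
                    sched_yield();
                }
            }
        }
    }

Running INIT and COMPUTE as two barrier-separated phases of the same node is what makes it safe for the BLAS path to dequantize src0 on all threads before thread 0 alone calls cblas_sgemm.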