CLBlast: Add broadcast support for matrix multiplication (#3402)

author shibe2 <redacted>

Mon, 2 Oct 2023 19:26:15 +0000 (23:26 +0400)

committer GitHub <redacted>

Mon, 2 Oct 2023 19:26:15 +0000 (21:26 +0200)
author shibe2 <redacted>
Mon, 2 Oct 2023 19:26:15 +0000 (23:26 +0400)
committer GitHub <redacted>
Mon, 2 Oct 2023 19:26:15 +0000 (21:26 +0200)
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp

index c7d9150fec2f044dfc18f6246c476946ba208c8b..7e4069d76b2595737edfeb6c3415a4d8ef8949fe 100644 (file)
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  
      const int64_t ne10 = src1->ne[0];
      const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
  
      const int nb2  = dst->nb[2];
      const int nb3  = dst->nb[3];
  
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
      const float alpha = 1.0f;
      const float beta = 0.0f;
      const int x_ne = ne01 * ne00;
@@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
      cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
      cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
  
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
              // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
              }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
  
              CL_CHECK(clFinish(queue));
  
@@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
              }
  
              // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
              CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
          }
      }
@@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  
      const int64_t ne10 = src1->ne[0];
      const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
  
      const int nb10 = src1->nb[0];
      const int nb11 = src1->nb[1];
@@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
      const int nb2  = dst->nb[2];
      const int nb3  = dst->nb[3];
  
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
      const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
      const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
      const int x_ne = ne01 * ne00;
@@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
      bool src1_cont_rows = nb10 == sizeof(float);
      bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
  
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
              // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
              }
  
              // convert src1 to fp16
              // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
              if (src1_cont_rows) {
                  if (src1_cont_cols) {
                      ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                  }
                  else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                      }
                  }
              }
              else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                          // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                      }
                  }
              }
@@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
              // copy dst to host, then convert to float
              CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
  
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  
              ggml_fp16_to_fp32_row(tmp, d, d_ne);
          }
@@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  
      const int64_t ne10 = src1->ne[0];
      const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
  
      const int nb2  = dst->nb[2];
      const int nb3  = dst->nb[3];
      const ggml_type type = src0->type;
      const bool mul_mat_vec = ne11 == 1;
  
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
      const float alpha = 1.0f;
      const float beta = 0.0f;
      const int x_ne = ne01 * ne00;
@@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
      size_t ev_idx = 0;
      std::vector<cl_event> events;
  
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
              // copy src0 to device if necessary
              if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
              } else if (src0->backend == GGML_BACKEND_GPU) {
                  d_Q = (cl_mem) src0->extra;
              } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
              if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                  // copy src1 to device
                  events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
  
                  // compute
                  const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                  CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
  
                  // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
  
                  events.emplace_back();
  
@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
              }
  
              // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
              CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
              for (auto *event : events) {
                  clReleaseEvent(event);
diff --git a/ggml.c b/ggml.c

index 820fe2e74b0ae7058aa7d930c2e428623c8044a1..bf1426d2558ea2d973b2d71b59bb342604a70dcc 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -11621,11 +11621,6 @@ static void ggml_compute_forward_mul_mat(
  
  #if defined(GGML_USE_CLBLAST)
      if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        //       ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
          if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
              ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
          }
author	shibe2 <redacted>
	Mon, 2 Oct 2023 19:26:15 +0000 (23:26 +0400)
committer	GitHub <redacted>
	Mon, 2 Oct 2023 19:26:15 +0000 (21:26 +0200)
ggml-opencl.cpp		patch \| blob \| history
ggml.c		patch \| blob \| history