ggml : sync llama.cpp (OpenCL support for GPU offload)
author    Georgi Gerganov <redacted>
          Sat, 27 May 2023 08:55:25 +0000 (11:55 +0300)
committer Georgi Gerganov <redacted>
          Sat, 27 May 2023 08:55:25 +0000 (11:55 +0300)
include/ggml/ggml.h
src/ggml-opencl.h
src/ggml.c

include/ggml/ggml.h
index 51a616c501bb3eb46bf2b20c727b0c0a0b7a16dc..c22d938363cf3f65acc5dc3e0b2f51dd477c6f42 100644 (file)
@@ -249,6 +249,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
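
The new GGML_BACKEND_CL value slots in next to the CPU and CUDA backends. A hypothetical sketch (not part of this commit) of how code that owns a tensor might branch on it, assuming the tensor carries the backend field used by llama.cpp of this era:

    // hypothetical dispatch on where a tensor's data lives
    switch (tensor->backend) {
        case GGML_BACKEND_CPU:  /* regular CPU kernels   */ break;
        case GGML_BACKEND_CUDA: /* cuBLAS / CUDA kernels */ break;
        case GGML_BACKEND_CL:   /* CLBlast / OpenCL path */ break;
    }
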
src/ggml-opencl.h
index 7bcc603ef8432f90a48e279168ba577f2b8468fd..5a1a500930b9aa3bc8912e6d41fa8a5b741e8da7 100644 (file)
@@ -1,23 +1,21 @@
 #pragma once
 
+#include "ggml.h"
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
 void ggml_cl_init(void);
 
-enum ggml_blas_order {
-    GGML_BLAS_ORDER_ROW_MAJOR = 101,
-    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
-};
+bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
 
-enum ggml_blas_op {
-    GGML_BLAS_OP_N = 111,
-    GGML_BLAS_OP_T = 112,
-    GGML_BLAS_OP_C = 113,
-};
+void * ggml_cl_host_malloc(size_t size);
+void   ggml_cl_host_free(void * ptr);
 
-void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
 
 #ifdef  __cplusplus
 }
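
The header drops the CLBlast-era SGEMM wrapper and its order/transpose enums in favor of a mat-mul-oriented interface, plus host allocation helpers (ggml_cl_host_malloc / ggml_cl_host_free, presumably for pinned staging buffers, mirroring the CUDA host allocator) and ggml_cl_transform_tensor for moving weights to the device. A minimal standalone sketch of the calling convention implied by the ggml.c hunks below; the function name and the malloc'd scratch buffer are illustrative only (inside ggml.c the buffer is params->wdata):

    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-opencl.h"

    // sketch: run one mat-mul through the OpenCL backend if it accepts it
    static void mul_mat_via_opencl(const struct ggml_tensor * src0,
                                   const struct ggml_tensor * src1,
                                   struct ggml_tensor * dst) {
        if (!ggml_cl_can_mul_mat(src0, src1, dst)) {
            return; // caller falls back to the CPU / BLAS paths
        }
        const size_t wsize = ggml_cl_mul_mat_get_wsize(src0, src1, dst);
        void * wdata = wsize > 0 ? malloc(wsize) : NULL;
        ggml_cl_mul_mat(src0, src1, dst, wdata, wsize);
        free(wdata);
    }
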
src/ggml.c
index 7612c86dcf06a634091fbc1af977601bed4b5394..66238f0fc0d9d1bd91c2b96e215dbfb7fdcadd1e 100644 (file)
@@ -9432,7 +9432,7 @@ static void ggml_compute_forward_rms_norm_back(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9473,7 +9473,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -9537,9 +9537,16 @@ static void ggml_compute_forward_mul_mat_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -9559,21 +9566,11 @@ static void ggml_compute_forward_mul_mat_f32(
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CLBLAST)
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        GGML_TYPE_F32);
-#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-#endif
             }
         }
         //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9712,9 +9709,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -9744,20 +9748,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     assert(id*sizeof(float) <= params->wsize);
                 }
 
-#if defined(GGML_USE_CLBLAST)
-                const float * x = wdata;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        GGML_TYPE_F32);
-#else
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -9769,7 +9759,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-#endif
             }
         }
 
@@ -9932,9 +9921,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -9957,9 +9953,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CLBLAST)
-                const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
-#else
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9971,23 +9964,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 }
 
                 const float * x = wdata;
-#endif
 
-#if defined(GGML_USE_CLBLAST)
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        type);
-#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-#endif
             }
         }
 
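The same guard is inserted into all three mat-mul kernels (F32, F16 x F32, and quantized x F32): if the OpenCL backend accepts the node, thread 0 runs it during GGML_TASK_COMPUTE and every thread returns immediately, so the BLAS and scalar fallbacks below it never execute for that node; the old per-branch ggml_cl_sgemm_wrapper calls are removed outright. A condensed restatement of the repeated pattern (schematic, not literal code from the commit):

    #elif defined(GGML_USE_CLBLAST)
        if (ggml_cl_can_mul_mat(src0, src1, dst)) {
            // only one thread issues the OpenCL work; the rest just return
            if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
                ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
            }
            return;
        }
    #endif
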
@@ -14166,9 +14148,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                         }
                         else
+#elif defined(GGML_USE_CLBLAST)
+                        if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+                            node->n_tasks = 1; // TODO: this actually is doing nothing
+                                                //       the threads are still spinning
+                            cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+                        }
+                        else
 #endif
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -14182,13 +14171,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                             }
 #endif
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
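 
In the planning pass of ggml_graph_compute the CL path mirrors the CUDA one: when ggml_cl_can_mul_mat claims a node, n_tasks is forced to 1 and the node's scratch requirement comes from ggml_cl_mul_mat_get_wsize instead of the BLAS estimate. A hedged sketch of how that per-node figure feeds the graph-wide work buffer (plan_work_size is a hypothetical helper; the real logic is inlined in ggml_graph_compute):

    // sketch: largest per-node scratch requirement across the graph
    static size_t plan_work_size(struct ggml_cgraph * cgraph) {
        size_t work_size = 0;
        for (int i = 0; i < cgraph->n_nodes; i++) {
            struct ggml_tensor * node = cgraph->nodes[i];
            size_t cur = 0;
            if (node->op == GGML_OP_MUL_MAT &&
                ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
                node->n_tasks = 1; // the OpenCL mat-mul is single-threaded
                cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
            }
            if (cur > work_size) {
                work_size = cur;
            }
        }
        return work_size;
    }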
@@ -14632,9 +14621,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%s |", node->name);
         }
 
-        fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
-                i, node->ne[0], node->ne[1],
-                GGML_OP_SYMBOL[node->op]);
+        if (node->n_dims == 2) {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+        } else {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+        }
+
 
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);