cann : add Ascend NPU support (#2336)

author Mengqing Cao <redacted>

Fri, 9 Aug 2024 12:21:56 +0000 (20:21 +0800)

committer GitHub <redacted>

Fri, 9 Aug 2024 12:21:56 +0000 (15:21 +0300)
author Mengqing Cao <redacted>
Fri, 9 Aug 2024 12:21:56 +0000 (20:21 +0800)
committer GitHub <redacted>
Fri, 9 Aug 2024 12:21:56 +0000 (15:21 +0300)
diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile

index 2b009e8f9bd26acad24f58b43ec97cc4d7d6e879..5b7595b53c53f65b5faee9b2174b47c484f995bf 100644 (file)
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
  # title of most generated pages and in a few other places.
  # The default value is: My Project.
  
-PROJECT_NAME           = "llama.cpp"
+PROJECT_NAME           = "whisper.cpp"
  
  # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
  # could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER         =
  # for a project that appears at the top of each page and should give viewer a
  # quick idea about the purpose of the project. Keep the description short.
  
-PROJECT_BRIEF          = "llama inference engine"
+PROJECT_BRIEF          = "Port of OpenAI's Whisper model in C/C++"
  
  # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
  # in the documentation. The maximum height of the logo should not exceed 55
diff --git a/src/whisper.cpp b/src/whisper.cpp

index 8d836908a72225e10832952a6ea63b8aa9702424..0744dba184385937d74e6f1fb3e2fb490be1b229 100644 (file)
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -29,6 +29,10 @@
  #include "openvino/whisper-openvino-encoder.h"
  #endif
  
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
  #include "ggml.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
@@ -1283,6 +1287,16 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
      }
  #endif
  
+#ifdef GGML_USE_CANN
+    if (params.use_gpu) {
+        WHISPER_LOG_INFO("%s: using CANN backend\n", __func__);
+        result = ggml_backend_cann_init(params.gpu_device);
+        if (!result) {
+            WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__);
+        }
+    }
+#endif
+
      return result;
  }
  
@@ -1335,6 +1349,10 @@ static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_cont
      result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
  #endif
  
+#ifdef GGML_USE_CANN
+    result || (result == ggml_backend_cann_buffer_type(params.gpu_device));
+#endif
+
      result || (result = ggml_backend_cpu_buffer_type());
  
      return result;
@@ -4337,8 +4355,8 @@ const char * whisper_print_system_info(void) {
      s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
      s += "CUDA = "      + std::to_string(ggml_cpu_has_cuda())      + " | ";
      s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";
-    s += "OPENVINO = "  + std::to_string(whisper_has_openvino())          ;
-
+    s += "OPENVINO = "  + std::to_string(whisper_has_openvino())   + " | ";
+    s += "CANN = "      + std::to_string(ggml_cpu_has_cann())             ;
      return s.c_str();
  }
  
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 1ed74e543dd9f993670189d304110c1d989058e0..2f4117a627de0f5582626da0893558237934aeb0 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1,7 +1,6 @@
  #include <ggml.h>
  #include <ggml-alloc.h>
  #include <ggml-backend.h>
-#include <ggml-backend-impl.h>
  
  #include <algorithm>
  #include <array>
@@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                  im = nullptr;
              }
          }
+
          ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
          GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
+        // TODO: other cases
+        //#pragma omp parallel for
+        //for (int i = 0; i < tensor->ne[1]; i++) {
+        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+        //        i * tensor->ne[0], 1, tensor->ne[0], im);
+        //}
+
          ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
      } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
          // This is going to create some weird integers though.
          ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
      } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
      }
  }
  
@@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                          tt.to_float(&buf[i], vq.data(), bs);
                          tv.insert(tv.end(), vq.begin(), vq.end());
                      } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                      }
                  }
              }
@@ -760,7 +767,7 @@ struct test_dup : public test_case {
      }
  
      test_dup(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne = {10, 10, 20, 1},
              std::array<int64_t, 4> permute = {0, 0, 0, 0})
          : type(type), ne(ne), permute(permute),
              _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
@@ -780,9 +787,11 @@ struct test_cpy : public test_case {
      const ggml_type type_src;
      const ggml_type type_dst;
      const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> permute;
+    bool _src_use_permute;
  
      std::string vars() override {
-        return VARS_TO_STR3(type_src, type_dst, ne);
+        return VARS_TO_STR4(type_src, type_dst, ne, permute);
      }
  
      double max_nmse_err() override {
@@ -794,12 +803,17 @@ struct test_cpy : public test_case {
      }
  
      test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type_src(type_src), type_dst(type_dst), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> permute = {0, 0, 0, 0})
+        : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
+          _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
  
      ggml_tensor * build_graph(ggml_context * ctx) override {
          ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
+        if (_src_use_permute) {
+            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+        }
+        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
          ggml_tensor * out = ggml_cpy(ctx, src, dst);
          return out;
      }
@@ -1175,6 +1189,7 @@ struct test_soft_max : public test_case {
      }
  };
  
+
  // GGML_OP_ROPE
  struct test_rope : public test_case {
      const ggml_type type;
@@ -1267,6 +1282,32 @@ struct test_pool2d : public test_case {
      }
  };
  
+// GGML_OP_CONV_TRANSPOSE_1D
+struct test_conv_transpose_1d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+
+    const int s0; // stride
+    const int p0; // padding
+    const int d0; // dilation
+
+    std::string vars() override {
+        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
+    }
+
+    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
+                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
+                           int s0 = 1, int p0 = 0, int d0 = 1)
+        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
+        ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
+        return out;
+    }
+};
+
  // GGML_OP_IM2COL
  struct test_im2col : public test_case {
      const ggml_type type_input;
@@ -1280,7 +1321,7 @@ struct test_im2col : public test_case {
      // padding
      const int p0;
      const int p1;
-    // dilatation
+    // dilation
      const int d0;
      const int d1;
      // mode
@@ -1393,7 +1434,7 @@ struct test_argsort : public test_case {
                      ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
                  }
              } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
              }
          }
      }
@@ -1470,6 +1511,7 @@ struct test_group_norm : public test_case {
      const ggml_type type;
      const std::array<int64_t, 4> ne;
      const int32_t num_groups;
+    const float eps;
  
      std::string vars() override {
          return VARS_TO_STR3(type, ne, num_groups);
@@ -1477,12 +1519,13 @@ struct test_group_norm : public test_case {
  
      test_group_norm(ggml_type type = GGML_TYPE_F32,
              std::array<int64_t, 4> ne = {64, 64, 320, 1},
-            int32_t num_groups = 32)
-        : type(type), ne(ne), num_groups(num_groups) {}
+            int32_t num_groups = 32,
+            float eps = 1e-6f)
+        : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
  
      ggml_tensor * build_graph(ggml_context * ctx) override {
          ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
+        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
          return out;
      }
  };
@@ -2053,6 +2096,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
          GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
          GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
          GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+        GGML_TYPE_BF16,
      };
  
      // unary ops
@@ -2097,6 +2141,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  
      test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
      test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
+    // test cases for 1D im2col
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+
+    test_cases.emplace_back(new test_conv_transpose_1d());
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
+
  
      test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
      test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
@@ -2110,12 +2167,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
      test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
      test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
      test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
      test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
      test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
  
      for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
          for (ggml_type type_dst : all_types) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+           test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+        }
+    }
+    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
          }
      }
  
@@ -2165,6 +2232,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
          test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
      }
  
+#if 1
      for (ggml_type type_a : base_types) {
          for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
              test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1,  1}, {1, 1}));
@@ -2184,10 +2252,31 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
              test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
          }
      }
+#else
+    // m = a rows
+    // n = b rows
+    // k = cols
+    std::uniform_int_distribution<> dist_m(1, 128);
+    std::uniform_int_distribution<> dist_n(16, 128);
+    std::uniform_int_distribution<> dist_k(1, 16);
+    for (int i = 0; i < 1000; i++) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                int m = dist_m(rng);
+                int n = dist_n(rng);
+                int k = dist_k(rng) * ggml_blck_size(type_a);
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1,  1}, {1, 1}));
+            }
+        }
+    }
+#endif
  
      for (ggml_type type_a : other_types) {
          for (ggml_type type_b : {GGML_TYPE_F32}) {
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1,  1}, {1, 1}));
+            if (ggml_blck_size(type_a) != 256) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1,  1}, {1, 1}));
+            }
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1,  1}, {1, 1}));
          }
      }
  
@@ -2247,7 +2336,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
          for (int n = 0; n < 10; ++n) {
              int64_t ne0 = dist_ne0(rng);
              int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
+            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
          }
  
          exponent <<= 1;
@@ -2266,7 +2355,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
              }
          }
      }
-
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
      test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
      test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 0.0f));
      test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 8.0f));
@@ -2380,7 +2469,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
          return true;
      }
  
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
      return false;
  }
author	Mengqing Cao <redacted>
	Fri, 9 Aug 2024 12:21:56 +0000 (20:21 +0800)
committer	GitHub <redacted>
	Fri, 9 Aug 2024 12:21:56 +0000 (15:21 +0300)
ggml/src/ggml-cann/Doxyfile		patch \| blob \| history
src/whisper.cpp		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history