CANN: Support Ascend310P to accelerate F32 and F16 Model (llama/10216)

author leo-pony <redacted>

Fri, 22 Nov 2024 06:07:20 +0000 (14:07 +0800)

committer Georgi Gerganov <redacted>

Sun, 8 Dec 2024 18:14:35 +0000 (20:14 +0200)
author leo-pony <redacted>
Fri, 22 Nov 2024 06:07:20 +0000 (14:07 +0800)
committer Georgi Gerganov <redacted>
Sun, 8 Dec 2024 18:14:35 +0000 (20:14 +0200)
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt

index c8e15c6d40ce9171103ab3face52bd8da9c5e239..756200b893d0285e551be995a47f321a680a69ca 100644 (file)
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
      message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
  endif()
  
+# Auto-detect the Ascend SoC from field 3 of line 7 of `npu-smi info`; configuration aborts if detection fails.
+set(SOC_VERSION "")
+# Stores "Ascend<detected name>" in the caller's variable whose name is passed as the SOC_VERSION argument.
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR NOT npu_result EQUAL 0)
+        message(FATAL_ERROR "Auto-detect ascend soc type failed, please specify it manually with -DSOC_TYPE=<soc type> or check that the ascend device is working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
+endif()
+string(TOLOWER "${SOC_TYPE}" SOC_VERSION) # SOC_VERSION is lower-case for both given and auto-detected SOC_TYPE
+
+# Construct the SoC-specific compile definition ASCEND_<Soc_Major_SN>, such as ASCEND_910B, ASCEND_310P.
+# It is upper-cased so that it matches the `#ifdef ASCEND_310P` guards used in the sources.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+string(TOUPPER "ASCEND_${SOC_TYPE_MAJOR_SN}" SOC_TYPE_COMPILE_OPTION)
+
  if (CANN_INSTALL_DIR)
      # Only Support Linux.
      if (NOT UNIX)
@@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
      target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
      target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
  
+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
      message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
      message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
  else()
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp

index a4ec8418e2ab35e3d253601ac269c11fccfa5308..1f4ee986ceb89c8569dd87ad1f860d636d2ac6a0 100644 (file)
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  
      switch (src0->type) {
          case GGML_TYPE_F32:
+        {
+#ifdef ASCEND_310P
+             // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
              aclrtlaunch_ascendc_get_row_f32(
                  24, ctx.stream(), src0->data, src1->data, dst->data,
                  ((ggml_tensor*)src0->extra)->ne,
@@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                  ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                  ((ggml_tensor*)dst->extra)->nb);
              break;
+        }
          case GGML_TYPE_F16:
+        {
+#ifdef ASCEND_310P
+             // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
              aclrtlaunch_ascendc_get_row_f16(
                  24, ctx.stream(), src0->data, src1->data, dst->data,
                  ((ggml_tensor*)src0->extra)->ne,
@@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                  ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                  ((ggml_tensor*)dst->extra)->nb);
              break;
+        }
          case GGML_TYPE_Q4_0:
              aclrtlaunch_ascendc_get_row_q4_0(
                  24, ctx.stream(), src0->data, src1->data, dst->data,
diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt

index 5b4fef91b5877111a06ec413e87ad029ec2e3bc0..6a4e17cce54c9d273c2d65f1461f82544291d13a 100644 (file)
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -1,7 +1,3 @@
-if (NOT SOC_TYPE)
-    set (SOC_TYPE "Ascend910B3")
-endif()
-
  file(GLOB SRC_FILES
      get_row_f32.cpp
      get_row_f16.cpp
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
      dup.cpp
  )
  
-string(TOLOWER ${SOC_TYPE} SOC_VERSION)
  set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
  set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
  
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
      ${SRC_FILES}
  )
  
+message(STATUS "CANN: compile ascend kernels with SOC_VERSION:${SOC_VERSION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
  # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp

index e2c651152f48659f0ff19f23d071ccb9d728750b..99f03e05883aa24ffed147e936c354aa28a49b3b 100644 (file)
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -5,6 +5,7 @@
  using namespace AscendC;
  
  #define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supported by dup kernel is 65535
  
  template <typename SRC_T, typename DST_T>
  class DupByRows {
@@ -19,6 +20,7 @@ class DupByRows {
          // Input has four dims.
          int64_t op_block_num = GetBlockNum();
          int64_t op_block_idx = GetBlockIdx();
+        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
  
          // param
          num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@@ -51,24 +53,36 @@ class DupByRows {
  
      __aicore__ inline void copy_in() {
          LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
          src_queue.EnQue(src_local);
      }
  
      __aicore__ inline void copy_out() {
          LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
          DataCopyExtParams dataCopyParams;
          dataCopyParams.blockCount = 1;
          dataCopyParams.blockLen = num_elem * sizeof(DST_T);
          DataCopyPad(dst_gm, dst_local, dataCopyParams);
-
+#endif
          dst_queue.FreeTensor(dst_local);
      }
  
diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp

index c704b5b2ec0f39d638fce755ecd6892af4cd5979..416b45104de5bb62c42a5dc34064528998c2c1dc 100644 (file)
--- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                  int64_t *output_ne_ub, size_t *output_nb_ub) {
          // TODO, use template for F16/f32
          int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
  
          for (int i = 0; i < 4; i++) {
              input_ne[i] = input_ne_ub[i];
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
      }
  
      __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
          LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
          if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(half);
-            DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
          }
+        DataCopy(input_local, input_gm[offset], len);
          input_queue.EnQue(input_local);
      }
  
      __aicore__ inline void copy_out(uint32_t offset, size_t len) {
          LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
          if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
              DataCopyExtParams dataCopyParams;
              dataCopyParams.blockCount = 1;
              dataCopyParams.blockLen = tail * sizeof(float);
              DataCopyPad(output_gm[offset + len], output_local[len],
                          dataCopyParams);
+#endif
          }
          output_queue.FreeTensor(output_local);
      }
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
      GlobalTensor<float> output_gm;
      TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
      TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
  };
  
  template <typename T>
diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp

index 9db080af3699874c18f355c74f66f4283de8e527..02116905b18e4e6993cb8c2eb82cecc17438f624 100644 (file)
--- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
                                  int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                  int64_t *output_ne_ub, size_t *output_nb_ub) {
          int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
  
          for (int i = 0; i < 4; i++) {
              input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
  
      __aicore__ inline void copy_in(uint32_t offset, size_t len) {
          LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
          if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
          }
+        DataCopy(input_local, input_gm[offset], len);
          input_queue.EnQue(input_local);
      }
  
      __aicore__ inline void copy_out(uint32_t offset, size_t len) {
          LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
          if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
              DataCopyExtParams dataCopyParams;
              dataCopyParams.blockCount = 1;
              dataCopyParams.blockLen = tail * sizeof(float);
              DataCopyPad(output_gm[offset + len], output_local[len],
                          dataCopyParams);
+#endif
          }
          output_queue.FreeTensor(output_local);
      }
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
      GlobalTensor<float> output_gm;
      TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
      TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
  };
  
  template <typename T>
diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp

index a80bfeec2417d9c81ecc16f7c96af38ed0768575..377211096e1f51b0cfb08b76f1d7db058a6d7984 100644 (file)
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
          LocalTensor<float> output_local = output_queue.AllocTensor<float>();
  
          // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: support quantization on 310P (the casts below are skipped for this SoC)
+#else
          Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
          Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
          // Only mul need compile by group.
          half scale = scale_gm.GetValue(scale_offset);
author	leo-pony <redacted>
	Fri, 22 Nov 2024 06:07:20 +0000 (14:07 +0800)
committer	Georgi Gerganov <redacted>
	Sun, 8 Dec 2024 18:14:35 +0000 (20:14 +0200)
ggml/src/ggml-cann/CMakeLists.txt		patch \| blob \| history
ggml/src/ggml-cann/aclnn_ops.cpp		patch \| blob \| history
ggml/src/ggml-cann/kernels/CMakeLists.txt		patch \| blob \| history
ggml/src/ggml-cann/kernels/dup.cpp		patch \| blob \| history
ggml/src/ggml-cann/kernels/get_row_f16.cpp		patch \| blob \| history
ggml/src/ggml-cann/kernels/get_row_f32.cpp		patch \| blob \| history
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp		patch \| blob \| history