git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
cann: fix buffer_num and runtime speed slowly error (#8865)
authorwangshuai09 <redacted>
Mon, 5 Aug 2024 13:10:37 +0000 (21:10 +0800)
committerGitHub <redacted>
Mon, 5 Aug 2024 13:10:37 +0000 (21:10 +0800)
ggml/src/ggml-cann.cpp
ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp

index a15bc8aa29fcb7649ab8de260e5b8d05237053c7..81783b7b147dd48397390c77b18ab796c1e7c058 100644 (file)
@@ -1670,10 +1670,6 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                     // TODO: fix me
                     // Current groupsize should not be greater than k-1 in
                     // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
-                    if (op->src[0]->ne[0]-1 > QK8_0) {
-                        return true;
-                    }
-                    return false;
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
index f6deee3c5d8ba5694b696520cfd8533ceaf29330..9c8c86b66ad66dcd8d4dc4be4f986a531e1bb1d1 100644 (file)
@@ -12,6 +12,9 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
     __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                 int64_t *input_ne_ub, size_t *input_nb_ub,
                                 int64_t *output_ne_ub) {
+        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+        //                         permute=[0,0,0,0]):
+        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
 
@@ -61,13 +64,13 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
         pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
         pipe.InitBuffer(output_queue, BUFFER_NUM,
                             Group_Size * sizeof(int8_t) / 2);
-        pipe.InitBuffer(cast_queue , BUFFER_NUM, Group_Size * sizeof(float));
-        pipe.InitBuffer(work_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(max_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(min_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(scale_queue, BUFFER_NUM, 16*sizeof(half));
-        pipe.InitBuffer(int8_queue, BUFFER_NUM, Group_Size * sizeof(int8_t));
-        pipe.InitBuffer(half_queue, BUFFER_NUM, Group_Size * sizeof(half));
+        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
     }
 
     __aicore__ inline void copy_in(uint32_t offset) {
@@ -178,13 +181,15 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
             for (int64_t j = 0; j < group_size_in_row; j++) {
                 half scale = calculate_group(i, j);
                 scale_local.SetValue(scale_local_offset++, scale);
-                if (scale_local_offset == 16) {
+                // Copy Group_Size/2 length data each time.
+                if (scale_local_offset == Group_Size / 2) {
                     scale_local_offset = 0;
                     // TODO: OPTIMIZE ME
                     pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
+                    DataCopy(scale_gm[scale_global_offset], scale_local,
+                                      Group_Size / 2);
                     pipe_barrier(PIPE_ALL);
-                    scale_global_offset += 16;
+                    scale_global_offset += Group_Size / 2;
                 }
             }
         }