CANN: Update several operators to support FP16 data format (llama/16251)
author hipudding <redacted>
Mon, 13 Oct 2025 00:52:22 +0000 (08:52 +0800)
committer Georgi Gerganov <redacted>
Wed, 15 Oct 2025 06:29:17 +0000 (09:29 +0300)
Many Ascend operators internally use FP16 precision for computation.
If the input data is in FP32, it must first be cast to FP16 before
computation and then cast back to FP32 afterwards, which introduces
unnecessary cast operations. Moreover, FP16 computation involves a
significantly smaller workload than FP32, which yields a noticeable
efficiency improvement.

In this change, `get_rows`, `rms_norm`, and `flash_attn_ext` are extended
to support multiple data types. Validation on the Qwen2 0.5B model shows
correct accuracy and about a 10% performance gain in concurrent scenarios.

Co-authored-by: noemotiovon <redacted>
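
The core idea is to keep data in the destination type end to end and to
insert a cast only when the source and destination types actually differ,
instead of unconditionally round-tripping through FP32. The sketch below
models that dispatch pattern in isolation. It is a simplified illustration
under stated assumptions, not the backend code: `DType`, `Tensor`,
`index_select`, and `cast_to` are hypothetical stand-ins for the real
ggml/aclTensor types and the aclnn_* operator wrappers that appear in the
diff further down.

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-ins for illustration only; the real backend works on
// ggml_tensor / aclTensor objects and the aclnn_* CANN operator wrappers.
enum class DType { F16, F32 };

struct Tensor {
    DType       type;
    std::string name;
};

static const char * dtype_name(DType t) {
    return t == DType::F16 ? "F16" : "F32";
}

// Pretend row-gather kernel (the real code calls aclnn_index_select_4d).
static void index_select(const Tensor & src, Tensor & dst) {
    std::printf("index_select: %s (%s) -> %s (%s)\n",
                src.name.c_str(), dtype_name(src.type),
                dst.name.c_str(), dtype_name(dst.type));
}

// Pretend cast kernel (the real code calls aclnn_cast once, up front).
static Tensor cast_to(const Tensor & src, DType dst_type) {
    std::printf("cast: %s %s -> %s\n",
                src.name.c_str(), dtype_name(src.type), dtype_name(dst_type));
    return Tensor{dst_type, src.name + "_cast"};
}

// New-style dispatch: run in dst's type; cast only when the types differ.
static void get_rows(const Tensor & src, Tensor & dst) {
    if (src.type == dst.type) {
        index_select(src, dst);              // no cast round trip at all
    } else {
        Tensor tmp = cast_to(src, dst.type); // single cast into dst's type
        index_select(tmp, dst);
    }
}

int main() {
    Tensor weights{DType::F16, "weights"};
    Tensor rows_f16{DType::F16, "rows_f16"};
    Tensor rows_f32{DType::F32, "rows_f32"};

    get_rows(weights, rows_f16); // F16 -> F16: operate directly
    get_rows(weights, rows_f32); // F16 -> F32: one cast, then select
    return 0;
}
```

The same pattern drives the updated `rms_norm` cache helper (now
parameterized by `ggml_type`) and the `flash_attn_ext` output path, which
writes directly into an F16 destination and only allocates a staging buffer
plus a final cast when the destination is F32.
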
ggml/src/ggml-cann/aclnn_ops.cpp

index 434023dd22ab3f9062de06935f9c1b1500c4a52e..240e8a1b2c0251713745f47d30805b3936ab978b 100755 (executable)
@@ -894,14 +894,13 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
 }
 
 /**
- * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ * @brief Get or expand a cached tensor filled with a scalar value.
  *
- * This function manages cached device memory for float32 tensors. If the current
+ * This function manages cached device memory for tensors. If the current
  * cache size is insufficient for the requested tensor shape, the old memory will
- * be released and new memory will be allocated. The allocated buffer is then
- * initialized either with zeros (when @p value == 0.0f) or with the given scalar
- * value using CANN operations. Finally, an aclTensor object is created from the
- * cached memory and returned.
+ * be released and new memory will be allocated. The allocated buffer is
+ * initialized with the given scalar value using CANN operations.
+ * Finally, an aclTensor object is created from the cached memory and returned.
  *
  * @param ctx           The CANN backend context that manages device memory.
  * @param buffer        A pointer to the cached device buffer (will be allocated
@@ -910,17 +909,19 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
  *                      updated when the cache is expanded.
  * @param ne            The tensor shape array (number of elements in each dimension).
  * @param nb            The stride size for each dimension.
+ * @param dtype         Data type of the cached tensor.
  * @param dims          The number of tensor dimensions.
  * @param value         The scalar value used to fill the tensor (supports zero
  *                      initialization via memset or arbitrary values via fill_scalar).
  * @return              An aclTensor pointer created from the cached buffer.
  */
-static aclTensor* get_f32_cache_acl_tensor(
+static aclTensor* get_cache_acl_tensor(
     ggml_backend_cann_context& ctx,
     void** buffer,
     int64_t &cache_element,
     int64_t* ne,
     size_t* nb,
+    ggml_type dtype,
     int64_t dims,
     float value) {
     // Calculate total number of elements
@@ -928,7 +929,7 @@ static aclTensor* get_f32_cache_acl_tensor(
     for (int i = 0; i < dims; i++) {
         n_element *= ne[i];
     }
-    size_t size = n_element * sizeof(float);
+    size_t size = n_element * ggml_type_size(dtype);
 
     // Allocate or expand cache if needed
     if (cache_element < n_element) {
@@ -941,19 +942,17 @@ static aclTensor* get_f32_cache_acl_tensor(
         cache_element = n_element;
 
         // Initialize cache
-        if (value == 0.0f) {
-            ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
-        } else {
-            int64_t pool_ne[1] = { n_element };
-            size_t pool_nb[1] = { sizeof(float) };
-            aclTensor* acl_value = ggml_cann_create_tensor(
-                *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
-            aclnn_fill_scalar(ctx, 1, acl_value);
-            ggml_cann_release_resources(ctx, acl_value);
-        }
+        int64_t pool_ne[1] = { n_element };
+        size_t pool_nb[1] = { ggml_type_size(dtype) };
+        aclTensor* acl_value = ggml_cann_create_tensor(
+            *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype),
+            pool_ne, pool_nb, 1);
+        aclnn_fill_scalar(ctx, value, acl_value);
+        ggml_cann_release_resources(ctx, acl_value);
     }
 
-    return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+    return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype),
+            ggml_type_size(dtype), ne, nb, dims);
 }
 
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -965,35 +964,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    // build gamma, one...
+    // build gamma.
     size_t acl_gamma_nb[GGML_MAX_DIMS];
-    acl_gamma_nb[0] = sizeof(float);
+    // gamma's type is the same as dst's.
+    acl_gamma_nb[0] = ggml_type_size(dst->type);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
         acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
     }
-    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+    aclTensor* acl_gamma = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_one_tensor_cache.cache,
         ctx.rms_norm_one_tensor_cache.size,
         src->ne,
         acl_gamma_nb,
+        dst->type,
         1,        // dims
         1.0f      // value
     );
 
-    // build rstd, zero...
+    // build rstd.
     int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
     size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+    // rstd will always be F32.
     acl_rstd_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
         acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
     }
-    aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+    aclTensor* acl_rstd = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_zero_tensor_cache.cache,
         ctx.rms_norm_zero_tensor_cache.size,
         acl_rstd_ne,
         acl_rstd_nb,
+        GGML_TYPE_F32,
         GGML_MAX_DIMS - 1,
         0.0f      // value
     );
@@ -1765,33 +1768,35 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // src
     ggml_tensor* src1 = dst->src[1];  // index
 
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
     switch (src0->type) {
-        case GGML_TYPE_F32: {
-            aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
-                                dst->data, dst->ne, dst->nb,
-                                src1, dst->type);
-            break;
-        }
-        case GGML_TYPE_F16: {
-            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-            ggml_cann_pool_alloc src_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-            void* src_trans_buffer = src_buffer_allocator.get();
-            size_t src_trans_nb[GGML_MAX_DIMS];
-            src_trans_nb[0] = sizeof(float);
-            for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            if (src0->type == dst->type) {
+                aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
+                                    dst->data, dst->ne, dst->nb,
+                                    src1, dst->type);
+            } else {
+                aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = dst->nb[0];
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                    src0->ne, src_trans_nb, GGML_MAX_DIMS);
+                aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+                aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
+                                    dst->data, dst->ne, dst->nb,
+                                    src1, dst->type);
+                ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             }
-            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
-                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
-                src0->ne, src_trans_nb, GGML_MAX_DIMS);
-            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
-            aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
-                                dst->data, dst->ne, dst->nb,
-                                src1, dst->type);
-            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             break;
-        }
         case GGML_TYPE_Q8_0: {
             // add 1 dim for bcast mul.
             size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
@@ -1799,7 +1804,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
                 *dequant_ne;
             int64_t scale_offset = 0;
-
             // [3,4,5,64] -> [3,4,5,2,32]
             weight_ne[0] = QK8_0;
             weight_ne[1] = src0->ne[0] / QK8_0;
@@ -1809,7 +1813,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 weight_ne[i] = src0->ne[i - 1];
                 weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,1]
             scale_ne[0] = 1;
             scale_ne[1] = src0->ne[0] / QK8_0;
@@ -1819,18 +1822,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 scale_ne[i] = src0->ne[i - 1];
                 scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,32]
             dequant_ne = weight_ne;
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
             }
-
             scale_offset = ggml_nelements(src0) * sizeof(int8_t);
             ggml_cann_pool_alloc dequant_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-
+                ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
                 GGML_MAX_DIMS + 1);
@@ -1838,16 +1838,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
                 GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
             aclTensor* dequant_tensor = ggml_cann_create_tensor(
-                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
+                dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
                 dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
-
             aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             dequant_ne = src0->ne;
             for (int i = 1; i < GGML_MAX_DIMS; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
             }
-
             aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
                                    dequant_ne, dequant_nb,
                                    dst->data, dst->ne, dst->nb,
@@ -1965,16 +1963,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // Only check env once.
     static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
-        int64_t acl_stride[2] = {1, transpose_ne[1]};
-
-        // Reverse ne.
-        std::reverse(transpose_ne, transpose_ne + n_dims);
-
-        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
-
-        acl_weight_tensor = aclCreateTensor(
-            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
-            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
     } else {
         acl_weight_tensor =
             ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
@@ -3178,7 +3168,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         aclTensor* acl_src0_f16_tensor = nullptr;
         aclTensor* acl_src1_f16_tensor = nullptr;
         aclTensor* acl_src2_f16_tensor = nullptr;
-        aclTensor* acl_dst_f16_tensor  = nullptr;
 
         // Step 1: cast the src0 (Query) to fp16 if needed
         ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
@@ -3216,22 +3205,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
             src2_bsnd_nb, GGML_MAX_DIMS);
 
-        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
-        void* out_f16_buffer = out_f16_allocator.alloc(
-                                    ggml_nelements(dst) * faElemSize);
-
-        int64_t* out_f16_ne = src0_bsnd_ne;
-        size_t out_f16_nb[GGML_MAX_DIMS];
-        out_f16_nb[0] = faElemSize;
-        for(int i = 1; i < GGML_MAX_DIMS; ++i){
-            out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
-        }
-
-        acl_dst_f16_tensor = ggml_cann_create_tensor(
-            out_f16_buffer, faDataType, faElemSize,
-            out_f16_ne, out_f16_nb, GGML_MAX_DIMS
-        );
-
         // Step 3: create the PSEShift tensor if needed
         //         this tensor is considered as mask (f16) in the llama.cpp
         aclTensor* bcast_pse_tensor = nullptr;
@@ -3334,8 +3307,29 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         int64_t keyAntiquantMode = 0;
         int64_t valueAntiquantMode = 0;
 
-        // Step 5: launch the FusedInferAttentionScoreV2 kernel.
-        // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
+        GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+        aclTensor * fa_dst_tensor = nullptr;
+        aclTensor * acl_dst_tensor = nullptr;
+        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
+        if (dst->type == GGML_TYPE_F32) {
+            void* out_f16_buffer = out_f16_allocator.alloc(
+                                        ggml_nelements(dst) * faElemSize);
+
+            int64_t* out_f16_ne = src0_bsnd_ne;
+            size_t out_f16_nb[GGML_MAX_DIMS];
+            out_f16_nb[0] = faElemSize;
+            for(int i = 1; i < GGML_MAX_DIMS; ++i){
+                out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+            }
+
+            fa_dst_tensor = ggml_cann_create_tensor(
+                out_f16_buffer, faDataType, faElemSize,
+                out_f16_ne, out_f16_nb, GGML_MAX_DIMS
+            );
+        }
+        else {
+            fa_dst_tensor = ggml_cann_create_tensor(dst);
+        }
 
         GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
             acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
@@ -3357,23 +3351,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             blockSize, antiquantMode, // blockSize, antiquantMode
             softmaxLseFlag, // softmaxLseFlag
             keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
-            acl_dst_f16_tensor, // attentionOut
+            fa_dst_tensor, // attentionOut
             nullptr // softmaxLse
         );
 
-        // Step 6: post-processing, permute and cast to f32
-        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-        // TODO: when dst is fp16, don't need cast
-        aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
-        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
-                                         acl_src1_f16_tensor,
-                                         acl_src2_f16_tensor,
-                                         acl_dst_f16_tensor,
-                                         acl_dst_tensor);
-        if(src3 != nullptr){
-            ggml_cann_release_resources(ctx, bcast_pse_tensor);
+        if (dst->type == GGML_TYPE_F32) {
+            // Step 6: post-processing, permute and cast to f32
+            acl_dst_tensor = ggml_cann_create_tensor(dst);
+            aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
         }
-    }else{
+
+        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
+                                        acl_src1_f16_tensor,
+                                        acl_src2_f16_tensor,
+                                        fa_dst_tensor,
+                                        acl_dst_tensor,
+                                        bcast_pse_tensor);
+
+    } else {
         GGML_ABORT("Function is not implemented.");
     }
 }
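
The grow-only cache behind `get_cache_acl_tensor` can be pictured with the
standalone sketch below. It is a simplified host-memory illustration of the
sizing and reuse rule from the first hunk (reallocate and refill only when
the requested element count exceeds the cached capacity, with the byte size
derived from the element type); `scalar_cache` and `get_cached_scalar` are
hypothetical names, and the real code fills device memory through
`aclnn_fill_scalar` and wraps it with `ggml_cann_create_tensor`.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for the per-context cache used by rms_norm
// (e.g. the "ones" gamma cache): a grow-only buffer refilled on expansion.
struct scalar_cache {
    void *  buffer   = nullptr;
    int64_t elements = 0;   // capacity in elements, not bytes
};

// Return a buffer holding at least n_element values of T filled with `value`.
// Mirrors the expansion rule of get_cache_acl_tensor: the byte size is
// n_element * sizeof(T) (n_element * ggml_type_size(dtype) in the real code),
// and the buffer is only reallocated when the request outgrows the cache.
template <typename T>
static T * get_cached_scalar(scalar_cache & cache, int64_t n_element, T value) {
    if (cache.elements < n_element) {
        std::free(cache.buffer);
        cache.buffer   = std::malloc(n_element * sizeof(T));
        cache.elements = n_element;

        T * data = static_cast<T *>(cache.buffer);
        for (int64_t i = 0; i < n_element; ++i) {
            data[i] = value;   // the real code uses aclnn_fill_scalar on device
        }
    }
    return static_cast<T *>(cache.buffer);
}

int main() {
    scalar_cache ones;
    // The first request allocates and fills; a smaller later request reuses
    // the existing buffer without touching its contents.
    float * a = get_cached_scalar<float>(ones, 1024, 1.0f);
    float * b = get_cached_scalar<float>(ones,  256, 1.0f);
    std::printf("reused: %s, a[0] = %.1f\n", a == b ? "yes" : "no", a[0]);
    std::free(ones.buffer);
    return 0;
}
```

As in the real helper, reusing the cache assumes every caller requests the
same fill value for a given cache, which holds here because the backend keeps
separate "ones" and "zeros" caches.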