CANN: Fix precision issue on 310I DUO multi-devices (#15784)

author hipudding <redacted>

Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)

committer GitHub <redacted>

Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
author hipudding <redacted>
Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
committer GitHub <redacted>
Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md

index 2d8866e3bd495d7d5cb60fa37c80ab75d194fbbb..357253f43a0ceaad4d28fb94be72998db1a7fb6c 100755 (executable)
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
  
  ## Environment variable setup
  
-### GGML_CANN_ASYNC_MODE
-
-Enables asynchronous operator submission. Disabled by default.
-
  ### GGML_CANN_MEM_POOL
  
-Specifies the memory pool management strategy:
+Specifies the memory pool management strategy. Default is vmm.
  
  - vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
  
  - prio: Employs a priority queue-based memory pool management.
+
  - leg: Uses a fixed-size buffer pool.
  
  ### GGML_CANN_DISABLE_BUF_POOL_CLEAN
@@ -312,9 +309,8 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
  
  ### GGML_CANN_WEIGHT_NZ
  
-Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+Converts the matmul weight format from ND to NZ to improve performance. Enabled by default.
  
-### GGML_CANN_DISABLE_ACL_GRAPH
+### GGML_CANN_ACL_GRAPH
  
-When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
-This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
+Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp

index 5c6163ad4417c0a879c1a233fce1e70119610b70..2d81fbd5a185b11704ece8997d956b9670e902f1 100755 (executable)
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1956,7 +1956,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
      aclTensor* acl_weight_tensor;
  
      // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
      if (weight_to_nz && is_matmul_weight(weight)) {
          int64_t acl_stride[2] = {1, transpose_ne[1]};
  
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h

index a041a157c333a8b20d3c49bbe55caab31d51f1e6..e295f4ab472102e31bbd8757e9d3458f069a2c67 100755 (executable)
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -420,7 +420,7 @@ struct ggml_backend_cann_context {
          GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
              device, async_mode ? "ON" : "OFF");
  #ifdef USE_ACL_GRAPH
-        acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
+        acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
          GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
                __func__, device,
                acl_graph_mode ? "GRAPH" : "EAGER",
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp

index 64fb2beff0aef05d75136d642306fd63561d38a1..1aa2913a61788d6f1f87df789795db6521c923c7 100755 (executable)
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
      // Why aclrtSynchronizeDevice?
  
      // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
      if (!need_transform(tensor->type)) {
          ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                                ACL_MEMCPY_HOST_TO_DEVICE));
@@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
                                    ACL_MEMCPY_DEVICE_TO_DEVICE));
              return true;
          } else {
+#ifdef ASCEND_310P
+            // TODO: Support 310p P2P copy
+            return false;
+#endif
              // Different device but can access by peer.
              int32_t canAccessPeer = 0;
              ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
@@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
      int64_t ne0 = tensor->ne[0];
  
      // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
  
      // last line must bigger than 32, because every single op deal at
      // least 32 bytes.
@@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
      GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
                  ggml_backend_is_cann(backend_dst));
  
+    GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
+
      if (!ggml_backend_buffer_is_cann(src->buffer) ||
          !ggml_backend_buffer_is_cann(dst->buffer)) {
          return false;
@@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
          return true;
      }
      if (backend_src != backend_dst) {
+#ifdef ASCEND_310P
+        // TODO: Support 310p P2P copy
+        return false;
+#endif
          ggml_backend_cann_buffer_context* buf_ctx_src =
              (ggml_backend_cann_buffer_context*)buf_src->context;
          ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
          }
  
          // need open both directions for memcpyasync between devices.
-        ggml_cann_set_device(cann_ctx_dst->device);
          ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
          ggml_cann_set_device(cann_ctx_src->device);
          ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
                                     ACL_MEMCPY_DEVICE_TO_DEVICE,
                                     cann_ctx_src->stream()));
  
-        //TODO: workaround for Event didn`t work here.
-        aclrtSynchronizeStream(cann_ctx_src->stream());
+        // record event on src stream after the copy
+        if (!cann_ctx_src->copy_event) {
+            ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
+        }
+        ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+
+        // wait on dst stream for the copy to complete
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
      } else {
          // src and dst are on the same backend
          ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
author	hipudding <redacted>
	Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
committer	GitHub <redacted>
	Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
docs/backend/CANN.md		patch \| blob \| history
ggml/src/ggml-cann/aclnn_ops.cpp		patch \| blob \| history
ggml/src/ggml-cann/common.h		patch \| blob \| history
ggml/src/ggml-cann/ggml-cann.cpp		patch \| blob \| history