CANN: Fix precision issue on 310I DUO multi-devices (llama/15784)

author hipudding <redacted>

Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)

committer Georgi Gerganov <redacted>

Fri, 5 Sep 2025 09:54:13 +0000 (12:54 +0300)
author hipudding <redacted>
Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
committer Georgi Gerganov <redacted>
Fri, 5 Sep 2025 09:54:13 +0000 (12:54 +0300)
diff --git a/src/ggml-cann/aclnn_ops.cpp b/src/ggml-cann/aclnn_ops.cpp

index 5c6163ad4417c0a879c1a233fce1e70119610b70..2d81fbd5a185b11704ece8997d956b9670e902f1 100755 (executable)
--- a/src/ggml-cann/aclnn_ops.cpp
+++ b/src/ggml-cann/aclnn_ops.cpp
@@ -1956,7 +1956,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
      aclTensor* acl_weight_tensor;
  
      // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
      if (weight_to_nz && is_matmul_weight(weight)) {
          int64_t acl_stride[2] = {1, transpose_ne[1]};
  
diff --git a/src/ggml-cann/common.h b/src/ggml-cann/common.h

index a041a157c333a8b20d3c49bbe55caab31d51f1e6..e295f4ab472102e31bbd8757e9d3458f069a2c67 100755 (executable)
--- a/src/ggml-cann/common.h
+++ b/src/ggml-cann/common.h
@@ -420,7 +420,7 @@ struct ggml_backend_cann_context {
          GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
              device, async_mode ? "ON" : "OFF");
  #ifdef USE_ACL_GRAPH
-        acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
+        acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
          GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
                __func__, device,
                acl_graph_mode ? "GRAPH" : "EAGER",
diff --git a/src/ggml-cann/ggml-cann.cpp b/src/ggml-cann/ggml-cann.cpp

index 64fb2beff0aef05d75136d642306fd63561d38a1..1aa2913a61788d6f1f87df789795db6521c923c7 100755 (executable)
--- a/src/ggml-cann/ggml-cann.cpp
+++ b/src/ggml-cann/ggml-cann.cpp
@@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
      // Why aclrtSynchronizeDevice?
  
      // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
      if (!need_transform(tensor->type)) {
          ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                                ACL_MEMCPY_HOST_TO_DEVICE));
@@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
                                    ACL_MEMCPY_DEVICE_TO_DEVICE));
              return true;
          } else {
+#ifdef ASCEND_310P
+            // TODO: Support 310p P2P copy
+            return false;
+#endif
              // Different device but can access by peer.
              int32_t canAccessPeer = 0;
              ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
@@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
      int64_t ne0 = tensor->ne[0];
  
      // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
  
      // last line must bigger than 32, because every single op deal at
      // least 32 bytes.
@@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
      GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
                  ggml_backend_is_cann(backend_dst));
  
+    GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
+
      if (!ggml_backend_buffer_is_cann(src->buffer) ||
          !ggml_backend_buffer_is_cann(dst->buffer)) {
          return false;
@@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
          return true;
      }
      if (backend_src != backend_dst) {
+#ifdef ASCEND_310P
+        // TODO: Support 310p P2P copy
+        return false;
+#endif
          ggml_backend_cann_buffer_context* buf_ctx_src =
              (ggml_backend_cann_buffer_context*)buf_src->context;
          ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
          }
  
          // need open both directions for memcpyasync between devices.
-        ggml_cann_set_device(cann_ctx_dst->device);
          ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
          ggml_cann_set_device(cann_ctx_src->device);
          ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
                                     ACL_MEMCPY_DEVICE_TO_DEVICE,
                                     cann_ctx_src->stream()));
  
-        //TODO: workaround for Event didn`t work here.
-        aclrtSynchronizeStream(cann_ctx_src->stream());
+        // record event on src stream after the copy
+        if (!cann_ctx_src->copy_event) {
+            ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
+        }
+        ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+
+        // wait on dst stream for the copy to complete
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
      } else {
          // src and dst are on the same backend
          ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
author	hipudding <redacted>
	Thu, 4 Sep 2025 07:12:30 +0000 (15:12 +0800)
committer	Georgi Gerganov <redacted>
	Fri, 5 Sep 2025 09:54:13 +0000 (12:54 +0300)
src/ggml-cann/aclnn_ops.cpp		patch | blob | history
src/ggml-cann/common.h		patch | blob | history
src/ggml-cann/ggml-cann.cpp		patch | blob | history