## Environment variable setup
-### GGML_CANN_ASYNC_MODE
-
-Enables asynchronous operator submission. Disabled by default.
-
### GGML_CANN_MEM_POOL
-Specifies the memory pool management strategy:
+Specifies the memory pool management strategy. The default is vmm. See the sketch after the list below.
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
- prio: Employs a priority queue-based memory pool management.
+
- leg: Uses a fixed-size buffer pool.
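
A hedged sketch of how the strategy could be resolved, including the documented vmm-to-leg fallback. The `cann_pool_kind` enum, the `get_env` stand-in, and the `vmm_supported` probe are illustrative assumptions, not the backend's actual types:

```cpp
#include <cstdlib>
#include <optional>
#include <string>

enum class cann_pool_kind { VMM, PRIO, LEG };

// Illustrative stand-in for the env helper used elsewhere in this diff.
static std::optional<std::string> get_env(const char * name) {
    const char * v = std::getenv(name);
    return v ? std::optional<std::string>(v) : std::nullopt;
}

// Resolve GGML_CANN_MEM_POOL; vmm_supported would come from a hardware probe.
static cann_pool_kind choose_pool_kind(bool vmm_supported) {
    const std::string v = get_env("GGML_CANN_MEM_POOL").value_or("vmm");
    if (v == "prio") return cann_pool_kind::PRIO;
    if (v == "leg")  return cann_pool_kind::LEG;
    // "vmm" (the default) falls back to the legacy pool without VMM support.
    return vmm_supported ? cann_pool_kind::VMM : cann_pool_kind::LEG;
}
```

With `GGML_CANN_MEM_POOL=prio`, for example, the priority-queue pool would be chosen regardless of VMM support.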
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
### GGML_CANN_WEIGHT_NZ
-Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+Converts the matmul weight format from ND to NZ to improve performance. Enabled by default.
-### GGML_CANN_DISABLE_ACL_GRAPH
+### GGML_CANN_ACL_GRAPH
-When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
-This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
+When enabled, operators are executed through ACL graph execution rather than in op-by-op (eager) mode. Enabled by default.
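
Both GGML_CANN_WEIGHT_NZ and GGML_CANN_ACL_GRAPH are boolean switches that are read once and default to enabled. A minimal self-contained sketch of that pattern: `get_env` and `parse_bool` here are stand-ins for the helpers used in this diff, and the GGML_CANN_ACL_GRAPH line is an assumption mirroring the GGML_CANN_WEIGHT_NZ read visible in the code hunks below:

```cpp
#include <cstdio>
#include <cstdlib>
#include <optional>
#include <string>

static std::optional<std::string> get_env(const char * name) {
    const char * v = std::getenv(name);
    return v ? std::optional<std::string>(v) : std::nullopt;
}

// Accepts a few common truthy spellings; the real helper may differ.
static bool parse_bool(const std::string & value) {
    return value == "on" || value == "1" || value == "yes" || value == "true";
}

int main() {
    // Each flag is read once; both default to enabled when unset.
    static const bool weight_to_nz  = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
    static const bool use_acl_graph = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
    std::printf("weight_to_nz=%d use_acl_graph=%d\n", weight_to_nz, use_acl_graph);
}
```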
// Only check env once.
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
ACL_MEMCPY_DEVICE_TO_DEVICE));
return true;
} else {
+#ifdef ASCEND_310P
+ // TODO: Support 310p P2P copy
+ return false;
+#endif
        // Different devices, but reachable via peer access.
int32_t canAccessPeer = 0;
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
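
For reference, the call above is the standard ACL peer-access probe. A hedged helper-style sketch of the gate: `aclrtDeviceCanAccessPeer` and `ACL_SUCCESS` are real ACL runtime names, while `can_copy_peer` itself is illustrative and not a function from this patch:

```cpp
#include <acl/acl.h>
#include <cstdint>

// Illustrative: true when dev_src can reach dev_dst via P2P; errors are
// treated as "no peer access" rather than aborting.
static bool can_copy_peer(int32_t dev_src, int32_t dev_dst) {
    int32_t can_access = 0;
    if (aclrtDeviceCanAccessPeer(&can_access, dev_src, dev_dst) != ACL_SUCCESS) {
        return false;
    }
    return can_access != 0;
}
```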
int64_t ne0 = tensor->ne[0];
// Only check env once.
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
    // The last line must be larger than 32 bytes, because every single op
    // handles at least 32 bytes.
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
ggml_backend_is_cann(backend_dst));
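+    // Matmul weights may have been converted to NZ format and cannot be
+    // copied as plain bytes.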
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
+
if (!ggml_backend_buffer_is_cann(src->buffer) ||
!ggml_backend_buffer_is_cann(dst->buffer)) {
return false;
return true;
}
if (backend_src != backend_dst) {
+#ifdef ASCEND_310P
+ // TODO: Support 310p P2P copy
+ return false;
+#endif
ggml_backend_cann_buffer_context* buf_ctx_src =
(ggml_backend_cann_buffer_context*)buf_src->context;
ggml_backend_cann_buffer_context* buf_ctx_dst =
}
    // Peer access must be enabled in both directions for memcpyAsync between devices.
- ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_src->stream()));
- //TODO: workaround for Event didn`t work here.
- aclrtSynchronizeStream(cann_ctx_src->stream());
+ // record event on src stream after the copy
+ if (!cann_ctx_src->copy_event) {
+ ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
+ }
+ ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+
+ // wait on dst stream for the copy to complete
+ ggml_cann_set_device(cann_ctx_dst->device);
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
} else {
// src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
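
The record/wait pair above is the standard way to order work across device streams without a host-side stall, which is what the removed `aclrtSynchronizeStream` workaround did. A minimal self-contained sketch of the pattern: `aclrtCreateEventWithFlag`, `aclrtRecordEvent`, and `aclrtStreamWaitEvent` are the same ACL calls used in the patch, while `ACL_CHECK` is a simplified stand-in for the ggml-cann macro and the cached static event mirrors `cann_ctx_src->copy_event`:

```cpp
#include <acl/acl.h>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for the ACL_CHECK macro used in ggml-cann.
#define ACL_CHECK(stmt)                                          \
    do {                                                         \
        aclError err_ = (stmt);                                  \
        if (err_ != ACL_SUCCESS) {                               \
            std::fprintf(stderr, "ACL error %d at %s:%d\n",      \
                         (int)err_, __FILE__, __LINE__);         \
            std::abort();                                        \
        }                                                        \
    } while (0)

// Order dst_stream after the work already queued on src_stream (e.g. the
// device-to-device memcpy), without blocking the host thread.
static void wait_for_src_copy(aclrtStream src_stream, aclrtStream dst_stream) {
    static aclrtEvent copy_event = nullptr;
    if (copy_event == nullptr) {
        // Created lazily and reused, as in the patch above.
        ACL_CHECK(aclrtCreateEventWithFlag(&copy_event, ACL_EVENT_SYNC));
    }
    ACL_CHECK(aclrtRecordEvent(copy_event, src_stream));      // mark the copy point
    ACL_CHECK(aclrtStreamWaitEvent(dst_stream, copy_event));  // dst waits for it
}
```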