From: Chenguang Li Date: Tue, 2 Sep 2025 06:07:48 +0000 (+0800) Subject: CANN: Support eager execution mode under ACL graph compilation (#15712) X-Git-Tag: upstream/0.0.6527~174 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=2f853687b3bce15e143a22f678d1715060fd606c;p=pkg%2Fggml%2Fsources%2Fllama.cpp CANN: Support eager execution mode under ACL graph compilation (#15712) * [CANN] Support eager execution mode under ACL graph compilation Add support for running operators in eager mode while ACL graph compilation is enabled. This allows bypassing graph execution and directly submitting ops, which is useful for debugging and reducing graph build overhead in certain scenarios. Signed-off-by: noemotiovon * fix typo Signed-off-by: noemotiovon * rename to acl_graph_mode Signed-off-by: noemotiovon --------- Signed-off-by: noemotiovon --- diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 325e09bd..2d8866e3 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -314,3 +314,7 @@ Controls automatic cleanup of the memory pool. This option is only effective whe Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU. +### GGML_CANN_DISABLE_ACL_GRAPH + +When this variable is set to a true value, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode. +This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable. diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index f71aa9d1..a041a157 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -395,6 +395,7 @@ struct ggml_backend_cann_context { #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. 
std::unique_ptr cann_graph; + bool acl_graph_mode = true; #endif cann_task_queue task_queue; bool async_mode; @@ -404,7 +405,6 @@ struct ggml_backend_cann_context { ggml_cann_tensor_cache rms_norm_one_tensor_cache; ggml_cann_tensor_cache rms_norm_zero_tensor_cache; - aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ /** @@ -419,6 +419,13 @@ struct ggml_backend_cann_context { async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")); GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, device, async_mode ? "ON" : "OFF"); +#ifdef USE_ACL_GRAPH + acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or(""))); + GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", + __func__, device, + acl_graph_mode ? "GRAPH" : "EAGER", + acl_graph_mode ? "acl graph enabled" : "acl graph disabled"); +#endif } /** diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index da6d74d4..0d9eb8fa 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2252,6 +2252,10 @@ static enum ggml_status ggml_backend_cann_graph_compute( bool use_cann_graph = true; bool cann_graph_update_required = false; + if (!cann_ctx->acl_graph_mode) { + use_cann_graph = false; + } + if (use_cann_graph) { if (cann_ctx->cann_graph == nullptr) { cann_ctx->cann_graph.reset(new ggml_cann_graph());