//
uint32_t llama_context::graph_max_nodes() const {
- return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+ return std::max<uint32_t>(1024u, 8u*model.n_tensors());
}
// NOTE(review): this function is declared to return llm_graph_result * but the
// body visible here is empty — control falls off the end of a non-void function,
// which is undefined behavior in C++. Presumably it should return the context's
// reserved graph-result object (an internal member not visible in this chunk) —
// confirm against the full file; the body may have been lost in a bad merge/paste.
llm_graph_result * llama_context::get_gf_res_reserve() const {
}
// aggregate experts
+ // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+ // to avoid potentially a large number of add nodes during warmup
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14753
ggml_tensor * moe_out = nullptr;
- for (int i = 0; i < n_expert_used; ++i) {
+ for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
experts->nb[2], i*experts->nb[1]);
}
}
- if (n_expert_used == 1) {
+ if (hparams.n_expert_used == 1) {
// avoid returning a non-contiguous tensor
moe_out = ggml_cont(ctx0, moe_out);
}