From: Georgi Gerganov
Date: Fri, 18 Jul 2025 11:31:15 +0000 (+0300)
Subject: graph : avoid huge warm-up graphs for MoE models (#14753)
X-Git-Tag: upstream/0.0.6073~140
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=d498af3d5a00f96bdd37b534860f03a6d9e98d39;p=pkg%2Fggml%2Fsources%2Fllama.cpp

graph : avoid huge warm-up graphs for MoE models (#14753)

* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
---

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 1af19caa..6eb34473 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max(65536u, 5u*model.n_tensors());
+    return std::max(1024u, 8u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 7ea7fd61..7cac3b98 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
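
To make the node-count argument concrete, below is a minimal standalone C++ sketch (not llama.cpp code). It copies the two graph_max_nodes() formulas from the first hunk verbatim and estimates how many nodes the expert-aggregation loop in build_moe_ffn() emits, assuming (as the added comment implies) that the warm-up path previously ran the loop over all n_expert experts rather than hparams.n_expert_used. The model shape (n_layer, n_expert, n_expert_used, n_tensors) is invented for illustration, and only the view/add nodes of this one loop are counted:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // the two caps from the first hunk: old vs new graph_max_nodes()
    static uint32_t max_nodes_old(uint32_t n_tensors) { return std::max(65536u, 5u*n_tensors); }
    static uint32_t max_nodes_new(uint32_t n_tensors) { return std::max(1024u, 8u*n_tensors); }

    // nodes created by the aggregation loop in build_moe_ffn():
    // one ggml_view_2d per expert, plus one ggml_add per expert after the first
    static uint32_t moe_aggregate_nodes(uint32_t n_used) {
        return n_used + (n_used - 1);
    }

    int main() {
        // illustrative MoE shape, not taken from any specific model
        const uint32_t n_layer       = 61;
        const uint32_t n_expert      = 256;  // pre-fix warm-up loop bound
        const uint32_t n_expert_used = 8;    // hparams.n_expert_used
        const uint32_t n_tensors     = 1000;

        printf("aggregation nodes/layer, decode        : %u\n", moe_aggregate_nodes(n_expert_used));
        printf("aggregation nodes/layer, warm-up (old) : %u\n", moe_aggregate_nodes(n_expert));
        printf("aggregation nodes total, warm-up (old) : %u\n", n_layer*moe_aggregate_nodes(n_expert));

        // the new cap drops the fixed 65536 floor and scales with model size instead
        printf("graph_max_nodes, old: %u, new: %u\n", max_nodes_old(n_tensors), max_nodes_new(n_tensors));
    }

With the loop bounded by hparams.n_expert_used, the warm-up graph for the MoE block has the same shape as a regular decode graph, which is presumably what makes the smaller, tensor-count-proportional cap in graph_max_nodes() safe.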