From: Georgi Gerganov Date: Tue, 24 Feb 2026 22:01:13 +0000 (+0200) Subject: models : fix graph splits (#19866) X-Git-Tag: gguf-v0.18.0~26 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=244641955f6146f7e8474afff7772d427593a534;p=pkg%2Fggml%2Fsources%2Fllama.cpp models : fix graph splits (#19866) --- diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 4d6bb83c1..83d11241f 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -116,6 +116,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); + ggml_build_forward_expand(gf, cur); + // Check layer type by checking which tensors exist // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor bool is_kda = (layer.ssm_a != nullptr); diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 56eefd7de..bacf7a4c2 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -29,6 +29,8 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); + ggml_build_forward_expand(gf, cur); + // Determine layer type and build appropriate attention mechanism if (hparams.is_recurrent(il)) { // Linear attention layer (gated delta net) @@ -269,7 +271,6 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( cb(state_update_target, "state_update_target", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); - cb(conv_states_all, "conv_states_updated", il); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index c7295e336..77f18b5ae 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -29,6 +29,8 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); + ggml_build_forward_expand(gf, cur); + // Determine layer type and build appropriate attention mechanism if (hparams.is_recurrent(il)) { // Linear attention layer (gated delta net) @@ -269,7 +271,6 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( cb(state_update_target, "state_update_target", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); - cb(conv_states_all, "conv_states_updated", il); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 974120ea6..9d3a68cfe 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -21,6 +21,8 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); + ggml_build_forward_expand(gf, cur); + // Determine layer type and build appropriate attention mechanism if (hparams.is_recurrent(il)) { // Linear attention layer (gated delta net) @@ -354,7 +356,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(state_update_target, "state_update_target", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); - cb(conv_states_all, "conv_states_updated", il); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);