git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
models : fix graph splits (#19866)
author Georgi Gerganov <redacted>
Tue, 24 Feb 2026 22:01:13 +0000 (00:01 +0200)
committer GitHub <redacted>
Tue, 24 Feb 2026 22:01:13 +0000 (00:01 +0200)
src/models/kimi-linear.cpp
src/models/qwen35.cpp
src/models/qwen35moe.cpp
src/models/qwen3next.cpp

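The change is the same in all four model builders listed above: the "attn_norm" output is expanded into the compute graph right after it is built, before the layer branches into its attention / recurrent paths, and a leftover cb(conv_states_all, ...) debug callback after the conv-state copy is dropped. As an illustrative sketch only (not code from this commit), the standalone program below shows the ggml_build_forward_expand() pattern in isolation; the context size, tensor shape, and epsilon are arbitrary assumptions made for the example.

#include "ggml.h"

int main() {
    // small scratch context for the example; the 16 MiB size is an arbitrary assumption
    ggml_init_params params = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(params);
    ggml_cgraph  * gf  = ggml_new_graph(ctx);

    // stand-in for the layer input; shape chosen only for illustration
    ggml_tensor * inpL = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);

    // analogous to the attn_norm output built in the model files below
    ggml_tensor * cur = ggml_rms_norm(ctx, inpL, 1e-6f);

    // expanding the graph with the intermediate tensor records it (and its
    // dependencies) as graph nodes before any branching that follows, which
    // is the call each hunk below adds after the attn_norm callback
    ggml_build_forward_expand(gf, cur);

    // ... the rest of the layer graph would be built and expanded here ...

    ggml_free(ctx);
    return 0;
}

Note that ggml_build_forward_expand() only adds the tensor and its dependencies as graph nodes; it does not change the computed values, so the addition influences how the backend scheduler partitions the graph rather than the result.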
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index 4d6bb83c14215dc6041a2947bab8e27d92ecfd96..83d11241f8df898f4f008f5192419a1fc89e9e4d 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -116,6 +116,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
         cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
+        ggml_build_forward_expand(gf, cur);
+
         // Check layer type by checking which tensors exist
         // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
         bool is_kda = (layer.ssm_a != nullptr);
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 56eefd7de27ba15e79db0a96b8c9cbf159d03c38..bacf7a4c2eefc7b1a3f5b9b9b4c8b2270ec0ec67 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -29,6 +29,8 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
+        ggml_build_forward_expand(gf, cur);
+
         // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
             // Linear attention layer (gated delta net)
@@ -269,7 +271,6 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
     cb(state_update_target, "state_update_target", il);
 
     ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
-    cb(conv_states_all, "conv_states_updated", il);
 
     ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
     state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index c7295e3364f5e3d4eee6db78ba63b7a7a0e9d0e7..77f18b5aeb8e4abdc14d60b3fd857fd6097342f9 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -29,6 +29,8 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
+        ggml_build_forward_expand(gf, cur);
+
         // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
             // Linear attention layer (gated delta net)
@@ -269,7 +271,6 @@ ggml_tensor * llm_build_qwen35moe::build_layer_attn_linear(
     cb(state_update_target, "state_update_target", il);
 
     ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
-    cb(conv_states_all, "conv_states_updated", il);
 
     ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
     state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index 974120ea6f2b99b89f17cacd01398c2a97cc4a16..9d3a68cfe5e86c6d7966e7b25b6cbcf224147777 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -21,6 +21,8 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
+        ggml_build_forward_expand(gf, cur);
+
         // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
             // Linear attention layer (gated delta net)
@@ -354,7 +356,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     cb(state_update_target, "state_update_target", il);
 
     ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
-    cb(conv_states_all, "conv_states_updated", il);
 
     ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
     state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);