memory : fix kv cache size for hybrid models (#19559)
author    Georgi Gerganov <redacted>
          Fri, 13 Feb 2026 05:36:24 +0000 (07:36 +0200)
committer GitHub <redacted>
          Fri, 13 Feb 2026 05:36:24 +0000 (07:36 +0200)
src/llama-model.cpp

index 5816e9a954cc679dffa3af2f2ec65c63811220be..6b7da69e9d07401911c1c3081abef18eb5b0c09d 100644
@@ -7965,7 +7965,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             cparams.n_seq_max,
                             nullptr);
                 } else if (llm_arch_is_hybrid(arch)) {
-
                     // The main difference between hybrid architectures is the
                     // layer filters, so pick the right one here
                     llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
@@ -7990,7 +7989,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* attn_type_v       */ params.type_v,
                             /* attn_v_trans      */ !cparams.flash_attn,
                             /* attn_swa_full     */ params.swa_full,
-                            /* attn_kv_size      */ cparams.n_ctx,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
                             /* attn_n_ubatch     */ cparams.n_ubatch,
                             /* attn_n_pad        */ 1,
                             /* recurrent_type_r  */ GGML_TYPE_F32,
@@ -8007,7 +8006,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* attn_type_k       */ params.type_k,
                             /* attn_type_v       */ params.type_v,
                             /* attn_v_trans      */ !cparams.flash_attn,
-                            /* attn_kv_size      */ cparams.n_ctx,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
                             /* attn_n_pad        */ 1,
                             /* attn_n_swa        */ hparams.n_swa,
                             /* attn_swa_type     */ hparams.swa_type,
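
The change is identical in both hunks: the attention-side KV cache of a hybrid (attention + recurrent) model is now sized with the per-sequence context cparams.n_ctx_seq instead of the total context cparams.n_ctx. Below is a minimal sketch of the sizing arithmetic, not llama.cpp's actual API, assuming n_ctx_seq is derived as n_ctx / n_seq_max and using hypothetical values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical context parameters mirroring the cparams in the diff.
        const uint32_t n_ctx     = 8192;              // total context across all sequences
        const uint32_t n_seq_max = 4;                 // maximum number of parallel sequences
        const uint32_t n_ctx_seq = n_ctx / n_seq_max; // context available to one sequence

        // Before the fix: each sequence's attention cache was sized for the
        // full context, over-allocating KV cells by a factor of n_seq_max.
        const uint32_t kv_size_old = n_ctx;     // 8192 cells per sequence
        const uint32_t kv_size_new = n_ctx_seq; // 2048 cells per sequence

        std::printf("per-sequence KV size: old=%u new=%u\n",
                    (unsigned) kv_size_old, (unsigned) kv_size_new);
        return 0;
    }

Under that assumption, the old sizing over-allocates the attention cache by a factor of n_seq_max whenever more than one sequence is used, while the recurrent state of a hybrid model is already allocated per sequence.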