memory: respect unified KV cache in hybrid memory for eval tasks (#21224)

author Ettore Di Giacinto <redacted>

Wed, 1 Apr 2026 09:50:17 +0000 (11:50 +0200)

committer GitHub <redacted>

Wed, 1 Apr 2026 09:50:17 +0000 (12:50 +0300)
author Ettore Di Giacinto <redacted>
Wed, 1 Apr 2026 09:50:17 +0000 (11:50 +0200)
committer GitHub <redacted>
Wed, 1 Apr 2026 09:50:17 +0000 (12:50 +0300)
diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp

index 411769672afb0d6417db7c182e6cc8e29d659dbd..10e6b45979729951e3ad7f578dc51ca9b1100553 100644 (file)
--- a/src/llama-memory-hybrid-iswa.cpp
+++ b/src/llama-memory-hybrid-iswa.cpp
@@ -73,9 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
                  // if all tokens are output, split by sequence
                  ubatch = balloc.split_seq(n_ubatch);
              } else {
-                // TODO: non-sequential equal split can be done if using unified KV cache
-                //       for simplicity, we always use sequential equal split for now
-                ubatch = balloc.split_equal(n_ubatch, true);
+                // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
+                const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
+                ubatch = balloc.split_equal(n_ubatch, !unified);
              }
  
              if (ubatch.n_tokens == 0) {
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp

index a1b45e4a3cce39ff808068926a087417d7604d7d..4ce1af592c15dfdce1c4a2ddadd707781474ef9c 100644 (file)
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -73,9 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                  // if all tokens are output, split by sequence
                  ubatch = balloc.split_seq(n_ubatch);
              } else {
-                // TODO: non-sequential equal split can be done if using unified KV cache
-                //       for simplicity, we always use sequential equal split for now
-                ubatch = balloc.split_equal(n_ubatch, true);
+                // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
+                const bool unified = (mem_attn->get_n_stream() == 1);
+                ubatch = balloc.split_equal(n_ubatch, !unified);
              }
  
              if (ubatch.n_tokens == 0) {
author	Ettore Di Giacinto <redacted>
	Wed, 1 Apr 2026 09:50:17 +0000 (11:50 +0200)
committer	GitHub <redacted>
	Wed, 1 Apr 2026 09:50:17 +0000 (12:50 +0300)
src/llama-memory-hybrid-iswa.cpp		patch | blob | history
src/llama-memory-hybrid.cpp		patch | blob | history