context : fix reserve token padding to n_seqs (#18536)

author Georgi Gerganov <redacted>

Sat, 3 Jan 2026 13:45:34 +0000 (15:45 +0200)

committer GitHub <redacted>

Sat, 3 Jan 2026 13:45:34 +0000 (15:45 +0200)
author Georgi Gerganov <redacted>
Sat, 3 Jan 2026 13:45:34 +0000 (15:45 +0200)
committer GitHub <redacted>
Sat, 3 Jan 2026 13:45:34 +0000 (15:45 +0200)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index 34dfcd4724bdd177d6a8acc960b13ac282390a7d..c8a90fe9eacd26a6060b8aca41367349e959c3cb 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1458,7 +1458,7 @@ ggml_cgraph * llama_context::graph_reserve(
  
      if (n_tokens % n_seqs != 0) {
          n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
-        n_outputs = std::min(n_outputs, n_tokens);
+        n_outputs = std::max(n_outputs, n_tokens);
  
          LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
      }
author	Georgi Gerganov <redacted>
	Sat, 3 Jan 2026 13:45:34 +0000 (15:45 +0200)
committer	GitHub <redacted>
	Sat, 3 Jan 2026 13:45:34 +0000 (15:45 +0200)