llama : set n_outputs to 1 to avoid 0 outputs mean-pooling (#15791)

author Daniel Bevenius <redacted>

Thu, 4 Sep 2025 13:40:44 +0000 (15:40 +0200)

committer GitHub <redacted>

Thu, 4 Sep 2025 13:40:44 +0000 (15:40 +0200)
author Daniel Bevenius <redacted>
Thu, 4 Sep 2025 13:40:44 +0000 (15:40 +0200)
committer GitHub <redacted>
Thu, 4 Sep 2025 13:40:44 +0000 (15:40 +0200)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index 2de6fcf0cb20909aa5b393a564a4d68c69e578b1..6b3188be4bc015dc11db3939388ae21ec33de8fe 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -285,6 +285,9 @@ llama_context::llama_context(
          const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
          const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
  
+        // avoid reserving graphs with zero outputs (graph_reserve() asserts n_outputs >= 1)
+        n_outputs = 1;
+
          LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
  
          // resolve automatic Flash Attention use
@@ -1368,6 +1371,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
  
  ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
      LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+    GGML_ASSERT(n_outputs >= 1);
  
      if (n_tokens % n_seqs != 0) {
          n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
author	Daniel Bevenius <redacted>
	Thu, 4 Sep 2025 13:40:44 +0000 (15:40 +0200)
committer	GitHub <redacted>
	Thu, 4 Sep 2025 13:40:44 +0000 (15:40 +0200)