// reserve worst-case graph
if (!hparams.vocab_only) {
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+ const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between the token and embedding input graphs
+ // save n_outputs so it can be restored after the worst-case reservation
+ // TODO: something cleaner
+ const auto n_outputs_save = n_outputs;
+
// max number of outputs
n_outputs = n_tokens;
}
}
+ n_outputs = n_outputs_save; // restore the value saved before the reservation
+
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
ggml_backend_t backend = backend_ptrs[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
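
On the `TODO: something cleaner` above: a minimal sketch of one option, using only standard C++ (`scoped_restore` is a hypothetical name, not an existing llama.cpp or ggml type). An RAII guard captures the value on construction and writes it back on scope exit, removing the manual save/restore pair:

// Hypothetical scoped save/restore guard; a sketch, not part of the patch.
template <typename T>
struct scoped_restore {
    T & ref;   // variable being guarded
    T   saved; // value captured at construction

    explicit scoped_restore(T & r) : ref(r), saved(r) {}
    ~scoped_restore() { ref = saved; }

    // non-copyable, so the value cannot be restored twice
    scoped_restore(const scoped_restore &) = delete;
    scoped_restore & operator=(const scoped_restore &) = delete;
};

// Usage in the reservation path could then look like:
//
//     {
//         scoped_restore<decltype(n_outputs)> restore(n_outputs); // saves the current value
//         n_outputs = n_tokens; // worst-case number of outputs
//         // ... reserve the worst-case graph ...
//     } // destructor restores n_outputs here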