llama : skip output reordering for single token batches (#17466)

author Daniel Bevenius <redacted>
Mon, 24 Nov 2025 20:06:17 +0000 (21:06 +0100)
committer GitHub <redacted>
Mon, 24 Nov 2025 20:06:17 +0000 (21:06 +0100)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index 70a3ec62dfc635402e9e723960c3758ffb4cebfa..2aa6d52a242a0d4b93f7ec3483bffe0c9edfd8de 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1248,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
  
          // make the outputs have the same order they had in the user-provided batch
          // note: this is mostly relevant for recurrent models atm
-        if (!sorted_output) {
+        if (!sorted_output && n_outputs > 1) {
              GGML_ASSERT((size_t) n_outputs == out_ids.size());
  
              // TODO: is there something more efficient which also minimizes swaps?
author	Daniel Bevenius <redacted>
	Mon, 24 Nov 2025 20:06:17 +0000 (21:06 +0100)
committer	GitHub <redacted>
	Mon, 24 Nov 2025 20:06:17 +0000 (21:06 +0100)