* tools/main: llama-cli: prevent spurious assistant token (#13402)
During prompt ingestion, prompt tokens are accepted into the sampler history (for repetition penalties). The conversation-mode path then appended the piece for `common_sampler_last(smpl)` to `assistant_ss` before any new token had been sampled. At that point, "last" was a prompt-side token (e.g., an input prefix), so the assistant chat message began with a spurious extra piece.
Fix: append to `assistant_ss` only for a newly sampled (non-EOG) token. This affects only chat message assembly (`assistant_ss` / `chat_msgs` / `common_chat_format_single`); terminal stdout is unchanged. Sampling order/logits are unchanged.
Fixes #13402.
Signed-off-by: Vinkal Chudgar <redacted>
* Update tools/main/main.cpp
Co-authored-by: Sigbjørn Skjæret <redacted>
* tools/main: remove outdated comment
Signed-off-by: Vinkal Chudgar <redacted>
---------
Signed-off-by: Vinkal Chudgar <redacted>
Co-authored-by: Sigbjørn Skjæret <redacted>
embd.push_back(id);
+ if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) {
+ assistant_ss << common_token_to_piece(ctx, id, false);
+ }
+
// echo this to console
input_echo = true;
}
}
- // if current token is not EOG, we add it to current assistant message
if (params.conversation_mode && !waiting_for_first_input) {
- const auto id = common_sampler_last(smpl);
- assistant_ss << common_token_to_piece(ctx, id, false);
-
if (!prompt.empty()) {
prompt.clear();
is_interacting = false;