main : don't print special tokens with --grammar (#6923)

author Justine Tunney <redacted>

Sat, 25 May 2024 09:04:03 +0000 (05:04 -0400)

committer GitHub <redacted>

Sat, 25 May 2024 09:04:03 +0000 (19:04 +1000)
author Justine Tunney <redacted>
Sat, 25 May 2024 09:04:03 +0000 (05:04 -0400)
committer GitHub <redacted>
Sat, 25 May 2024 09:04:03 +0000 (19:04 +1000)
diff --git a/common/common.cpp b/common/common.cpp

index c6459038560f137a56c5413ac9893b0dbab29faa..781f2166bb66aab9e384bd82b78d5198d5839caf 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          params.interactive_specials = true;
          return true;
      }
+    if (arg == "--no-special") {
+        params.no_special = true;
+        return true;
+    }
      if (arg == "--embedding") {
          params.embedding = true;
          return true;
@@ -1364,6 +1368,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
      printf("  -i, --interactive     run in interactive mode\n");
      printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
      printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  --no-special          control tokens output disabled\n");
      printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
      printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
      printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
diff --git a/common/common.h b/common/common.h

index f68f3c2979b94b3c9995df8fef007854d824e0df..5388f6b68973c88e5c5f3fbb16e150626eb1e18b 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -146,6 +146,7 @@ struct gpt_params {
      bool use_color         = false; // use color to distinguish generations and inputs
      bool interactive       = false; // interactive mode
      bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool no_special        = false; // disable control token output
      bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
      bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
      bool prompt_cache_all  = false; // save user input and generations to prompt cache
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 09fa85fce0ee35864b757d56a50eb5b8c202058e..ac35772f1e13354a2d15aeeb01bfec5f1b141bab 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -740,18 +740,32 @@ int main(int argc, char ** argv) {
          // display text
          if (input_echo && display) {
              for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
-                printf("%s", token_str.c_str());
+                const std::string token_str = llama_token_to_piece(ctx, id);
+
+                // Console/Stream Output
+                if (!llama_token_is_control(llama_get_model(ctx), id)) {
+                    // Stream Output Token To Standard Output
+                    fprintf(stdout, "%s", token_str.c_str());
+                } else if (!params.no_special && !params.conversation) {
+                    // Stream Control Token To Standard Output Stream
+                    fprintf(stdout, "%s", token_str.c_str());
+                }
  
+                // Record Displayed Tokens To Log
+                // Note: generated tokens arrive one at a time, so embd.size() > 1 indicates input (prompt) tokens
                  if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                      input_tokens.push_back(id);
                  } else {
+                    // Outgoing Generated Tokens
                      output_tokens.push_back(id);
                      output_ss << token_str;
                  }
+
+                fflush(stdout);
              }
-            fflush(stdout);
          }
+
          // reset color to default if there is no pending user input
          if (input_echo && (int) embd_inp.size() == n_consumed) {
              console::set_display(console::reset);
diff --git a/llama.cpp b/llama.cpp

index 85cb3140d945b5bf26d68589548d943be0e49c77..989d27b9dfb3a3982fc15fde14936fa1e4621bb2 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -17861,6 +17861,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
      );
  }
  
+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
  llama_token llama_token_bos(const struct llama_model * model) {
      return model->vocab.special_bos_id;
  }
diff --git a/llama.h b/llama.h

index 16cece5db0e78f55b56ac0ab87cd4c67ff1e77b3..16676269dd38ad2f1fb6206eef331b486bbf36d0 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -823,6 +823,9 @@ extern "C" {
      // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
      LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
  
+    // Check whether the token is a control token (as opposed to a renderable text token)
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
      // Special tokens
      LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
      LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
author	Justine Tunney <redacted>
	Sat, 25 May 2024 09:04:03 +0000 (05:04 -0400)
committer	GitHub <redacted>
	Sat, 25 May 2024 09:04:03 +0000 (19:04 +1000)
common/common.cpp		patch | blob | history
common/common.h		patch | blob | history
examples/main/main.cpp		patch | blob | history
llama.cpp		patch | blob | history
llama.h		patch | blob | history