// Usage text: one line per command-line option (printed as part of --help).
// NOTE(review): removed stray leading "+" diff marker from the --no-escape line.
printf("    -p PROMPT, --prompt PROMPT       read prompt from the argument.\n");
printf("    --stdin                          read prompt from standard input.\n");
printf("    --no-bos                         do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
printf("    --no-escape                      do not escape input (such as \\n, \\t, etc.).\n");
printf("    --no-parse-special               do not parse control tokens.\n");
printf("    --log-disable                    disable logs. Makes stderr quiet when loading the model.\n");
printf("    --show-count                     print the total number of tokens.\n");
// Flags mirroring the command-line options above; all default to "off"
// and are flipped to true while parsing argv.
// NOTE(review): removed stray leading "+" diff marker from the no_escape line.
bool printing_ids = false;
bool no_bos = false;
bool no_escape = false;
bool no_parse_special = false;
bool disable_logging = false;
bool show_token_count = false;
// Continuation of the argv-matching if/else chain (the opening branch is
// above this excerpt). Each branch simply latches the matching flag.
// NOTE(review): removed stray leading "+" diff markers from the --no-escape branch.
else if (arg == "--no-bos") {
    no_bos = true;
}
else if (arg == "--no-escape") {
    no_escape = true;
}
else if (arg == "--no-parse-special") {
    no_parse_special = true;
}
const bool model_wants_add_bos = llama_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;
+ const bool escape = !no_escape;
+
+ if (escape) {
+ string_process_escapes(prompt);
+ }
std::vector<llama_token> tokens;
tokens = common_tokenize(model, prompt, add_bos, parse_special);