tokenize : add --show-count (token) option (#8299)

author Daniel Bevenius <redacted>

Thu, 4 Jul 2024 16:38:58 +0000 (18:38 +0200)

committer GitHub <redacted>

Thu, 4 Jul 2024 16:38:58 +0000 (19:38 +0300)
author Daniel Bevenius <redacted>
Thu, 4 Jul 2024 16:38:58 +0000 (18:38 +0200)
committer GitHub <redacted>
Thu, 4 Jul 2024 16:38:58 +0000 (19:38 +0300)
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp

index 54c9834afb1b930eea0c3a50928d586b3a10c5db..0180c87d863bfe699cb9bac36dedbb41d566b599 100644 (file)
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -30,6 +30,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
      fprintf(stream, "    --stdin                              read prompt from standard input.\n");
      fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
      fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
  }
  
  static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
      bool printing_ids = false;
      bool no_bos = false;
      bool disable_logging = false;
+    bool show_token_count = false;
      const char * model_path = NULL;
      const char * prompt_path = NULL;
      const char * prompt_arg = NULL;
@@ -249,6 +251,9 @@ int main(int raw_argc, char ** raw_argv) {
          else if (arg == "--log-disable") {
              disable_logging = true;
          }
+        else if (arg == "--show-count") {
+            show_token_count = true;
+        }
          else {
              fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
              return 1;
@@ -384,6 +389,9 @@ int main(int raw_argc, char ** raw_argv) {
          printf("]\n");
      }
  
+    if (show_token_count) { // --show-count: report how many tokens the prompt produced
+        printf("Total number of tokens: %zu\n", tokens.size()); // size() yields size_t, so %zu (not %ld) is the matching specifier
+    }
      // silence valgrind
      llama_free(ctx);
      llama_free_model(model);
author	Daniel Bevenius <redacted>
	Thu, 4 Jul 2024 16:38:58 +0000 (18:38 +0200)
committer	GitHub <redacted>
	Thu, 4 Jul 2024 16:38:58 +0000 (19:38 +0300)