From: compilade Date: Thu, 11 Jul 2024 07:41:48 +0000 (-0400) Subject: tokenize : add --no-parse-special option (#8423) X-Git-Tag: upstream/0.0.4488~1116 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=9a55ffe6fbf7f19c865f8f277ca7cd585cc0b094;p=pkg%2Fggml%2Fsources%2Fllama.cpp tokenize : add --no-parse-special option (#8423) This should allow more easily explaining how parse_special affects tokenization. --- diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 0180c87d..2afb6024 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) { fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); fprintf(stream, " --stdin read prompt from standard input.\n"); fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + fprintf(stream, " --no-parse-special do not parse control tokens.\n"); fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); fprintf(stream, " --show-count print the total number of tokens.\n"); } @@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) { // variables where to put any arguments we see. 
bool printing_ids = false; bool no_bos = false; + bool no_parse_special = false; bool disable_logging = false; bool show_token_count = false; const char * model_path = NULL; @@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) { else if (arg == "--no-bos") { no_bos = true; } + else if (arg == "--no-parse-special") { + no_parse_special = true; + } else if (arg == "-p" || arg == "--prompt") { if (prompt_set) { fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); @@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) { const bool model_wants_add_bos = llama_should_add_bos_token(model); const bool add_bos = model_wants_add_bos && !no_bos; + const bool parse_special = !no_parse_special; std::vector<llama_token> tokens; - tokens = ::llama_tokenize(model, prompt, add_bos, true); + tokens = ::llama_tokenize(model, prompt, add_bos, parse_special); if (printing_ids) { printf("[");