From: eric8607242
Date: Fri, 28 Jul 2023 18:10:05 +0000 (+0800)
Subject: llama : support more diverse tokenizers? (#2420)
X-Git-Tag: gguf-v0.4.0~382
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=ee1b497c985f61d6ec519c39fcfed78a3c6f1d06;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : support more diverse tokenizers? (#2420)

* supporting more diverse tokenizers

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov
---

diff --git a/llama.cpp b/llama.cpp
index a4489773..a35c690e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1924,7 +1924,9 @@ struct llama_tokenizer {
                 if (token == vocab_.token_to_id.end()) {
                     // output any symbols that did not form tokens as bytes.
                     for (int j = 0; j < (int) symbol.n; ++j) {
-                        llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                        // NOTE: old version, before #2420 - not sure what are the implications of this
+                        //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                        llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                         output.push_back(token_id);
                     }
                 } else {