llama : support more diverse tokenizers? (#2420)
author eric8607242 <redacted>
Fri, 28 Jul 2023 18:10:05 +0000 (02:10 +0800)
committer GitHub <redacted>
Fri, 28 Jul 2023 18:10:05 +0000 (21:10 +0300)
* supporting more diverse tokenizers

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <redacted>
llama.cpp

index a4489773f96c5b2f24697df372c6fbd05a1c8f79..a35c690ea9f68ccbe97c0b78652300fe6653c335 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1924,7 +1924,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
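For context, here is a minimal standalone sketch of the per-character fallback that the hunk above switches to: instead of assuming byte tokens sit at a fixed id offset (byte value + 3), each character is looked up in the vocabulary map itself. The names (tokenize_fallback, token_to_id) are illustrative, not the actual llama.cpp API; the real patch uses token_to_id.at(...), which throws if the character is missing, whereas this sketch checks with find() and emits a placeholder id instead.

// Minimal sketch of a per-character vocabulary-lookup fallback.
// Illustrative names only; not the llama.cpp implementation.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

using token_id = int;

std::vector<token_id> tokenize_fallback(
        const std::string & symbol,
        const std::unordered_map<std::string, token_id> & token_to_id) {
    std::vector<token_id> output;
    for (char c : symbol) {
        // look up each character as its own single-character token
        auto it = token_to_id.find(std::string(1, c));
        if (it != token_to_id.end()) {
            output.push_back(it->second);  // character is defined by the vocab
        } else {
            output.push_back(-1);          // placeholder for an unknown character
        }
    }
    return output;
}

int main() {
    const std::unordered_map<std::string, token_id> token_to_id = {
        { "h", 10 }, { "i", 11 },
    };
    for (token_id id : tokenize_fallback("hi", token_to_id)) {
        printf("%d\n", id);  // prints 10 then 11
    }
    return 0;
}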