llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478)

author Aarni Koskela <redacted>

Tue, 13 Feb 2024 16:18:16 +0000 (18:18 +0200)

committer GitHub <redacted>

Tue, 13 Feb 2024 16:18:16 +0000 (18:18 +0200)
author Aarni Koskela <redacted>
Tue, 13 Feb 2024 16:18:16 +0000 (18:18 +0200)
committer GitHub <redacted>
Tue, 13 Feb 2024 16:18:16 +0000 (18:18 +0200)
diff --git a/llama.cpp b/llama.cpp

index 61c695187def84d091b699a85083f7222b76e8a4..8ebbf7628c1e4a1f676a265dc4bcc4e9ae987bea 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -3314,7 +3314,12 @@ static void llm_load_vocab(
  
      // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
      if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
      } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
          vocab.linefeed_id = vocab.special_pad_id;
      } else {
@@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
      switch (llama_vocab_get_type(vocab)) {
          case LLAMA_VOCAB_TYPE_SPM: {
              const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            return vocab.token_to_id.at(buf);
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
          }
          case LLAMA_VOCAB_TYPE_WPM:
          case LLAMA_VOCAB_TYPE_BPE: {
author	Aarni Koskela <redacted>
	Tue, 13 Feb 2024 16:18:16 +0000 (18:18 +0200)
committer	GitHub <redacted>
	Tue, 13 Feb 2024 16:18:16 +0000 (18:18 +0200)