// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ try {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } catch (const std::exception & e) {
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+ vocab.linefeed_id = vocab.special_pad_id;
+ }
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.linefeed_id = vocab.special_pad_id;
} else {
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
- return vocab.token_to_id.at(buf);
+ auto token = vocab.token_to_id.find(buf);
+ if (token != vocab.token_to_id.end()) {
+ return (*token).second;
+ }
+ // Try to fall back to just the byte as a string
+ const char buf2[2] = { (char)ch, 0 };
+ return vocab.token_to_id.at(buf2);
}
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: {