From: opparco
Date: Sun, 3 Sep 2023 10:18:09 +0000 (+0900)
Subject: llama : fix bpe tokenize from byte (#2889)
X-Git-Tag: gguf-v0.4.0~141
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=37301347767d555d0a66c043ce4ef6ead8e61c55;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : fix bpe tokenize from byte (#2889)
---

diff --git a/llama.cpp b/llama.cpp
index 2b0cf30f..c97c1462 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3366,9 +3366,15 @@ struct llm_tokenizer_bpe {
                             std::string byte_str(1, *j);
                             auto token_multibyte = vocab.token_to_id.find(byte_str);
                             if (token_multibyte == vocab.token_to_id.end()) {
-                                fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                                try {
+                                    llama_token token_byte = llama_byte_to_token(vocab, *j);
+                                    output.push_back(token_byte);
+                                } catch (const std::out_of_range & err) {
+                                    fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                                }
+                            } else {
+                                output.push_back((*token_multibyte).second);
                             }
-                            output.push_back((*token_multibyte).second);
                         }
                     } else {
                         output.push_back((*token).second);