From: mgroeber9110
Date: Thu, 30 Jan 2025 10:10:59 +0000 (+0100)
Subject: vocab : correctly identify LF token for GPT-2 style BPE tokenizer (#11496)
X-Git-Tag: upstream/0.0.4631~37
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=ffd0821c57edc7e5d04338ab0c6b1461198df15f;p=pkg%2Fggml%2Fsources%2Fllama.cpp

vocab : correctly identify LF token for GPT-2 style BPE tokenizer (#11496)
---

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 561f8bdb..ad9ffe66 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             linefeed_id = ids[0];
         } else {
-            const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+            const std::vector<int> ids = tokenize("\n", false);
 
             //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             if (ids.empty()) {