vocab : correctly identify LF token for GPT-2 style BPE tokenizer (#11496)

author mgroeber9110 <redacted>

Thu, 30 Jan 2025 10:10:59 +0000 (11:10 +0100)

committer GitHub <redacted>

Thu, 30 Jan 2025 10:10:59 +0000 (12:10 +0200)
author mgroeber9110 <redacted>
Thu, 30 Jan 2025 10:10:59 +0000 (11:10 +0100)
committer GitHub <redacted>
Thu, 30 Jan 2025 10:10:59 +0000 (12:10 +0200)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index 561f8bdb840106a871d6866effe8d71c2030dd26..ad9ffe66aa749d97ec19ebe7286ed226e63b5575 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
          GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
          linefeed_id = ids[0];
      } else {
-        const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+        const std::vector<int> ids = tokenize("\n", false);
  
          //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
          if (ids.empty()) {
author	mgroeber9110 <redacted>
	Thu, 30 Jan 2025 10:10:59 +0000 (11:10 +0100)
committer	GitHub <redacted>
	Thu, 30 Jan 2025 10:10:59 +0000 (12:10 +0200)