From: Christian Fillion Date: Fri, 7 Feb 2025 13:55:47 +0000 (-0500) Subject: vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729) X-Git-Tag: upstream/0.0.4719~54 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=2d219b389e8c8c40bce547b08c8aa7add60fde1f;p=pkg%2Fggml%2Fsources%2Fllama.cpp vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729) Silently insert U+FFFD(s) (Unicode replacement character) instead until the next valid codepoint can be found. This fixes `llama_tokenize` throwing an exception across the C API boundary or libllama's module boundary (the caller's runtime might be incompatible!) Returning a proper error code might be desirable, however the signature of `llama_tokenize` doesn't allow it as all return values already have existing meaning. --- diff --git a/src/unicode.cpp b/src/unicode.cpp index 89180da4..a32ae6d0 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) { result.reserve(utf8.size()); size_t offset = 0; while (offset < utf8.size()) { - result.push_back(unicode_cpt_from_utf8(utf8, offset)); + try { + result.push_back(unicode_cpt_from_utf8(utf8, offset)); + } + catch (const std::invalid_argument & /*ex*/) { + // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize + ++offset; + result.emplace_back(0xFFFD); // replacement character + } } return result; }