From: Christian Fillion Date: Fri, 7 Feb 2025 13:55:47 +0000 (-0500) Subject: vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729) X-Git-Tag: upstream/0.0.4719~54 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=2d219b389e8c8c40bce547b08c8aa7add60fde1f;p=pkg%2Fggml%2Fsources%2Fllama.cpp vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729) Silently insert U+FFFD(s) (Unicode replacement character) instead until the next valid codepoint can be found. This fixes `llama_tokenize` throwing an exception across the C API boundary or libllama's module boundary (the caller's runtime might be incompatible!) Returning a proper error code might be desirable, however the signature of `llama_tokenize` doesn't allow it as all return values already have existing meaning. --- diff --git a/src/unicode.cpp b/src/unicode.cpp index 89180da4..a32ae6d0 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) { result.reserve(utf8.size()); size_t offset = 0; while (offset < utf8.size()) { - result.push_back(unicode_cpt_from_utf8(utf8, offset)); + try { + result.push_back(unicode_cpt_from_utf8(utf8, offset)); + } + catch (const std::invalid_argument & /*ex*/) { + // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize + ++offset; + result.emplace_back(0xFFFD); // replacement character + } } return result; }