vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729)

author Christian Fillion <redacted>

Fri, 7 Feb 2025 13:55:47 +0000 (08:55 -0500)

committer GitHub <redacted>

Fri, 7 Feb 2025 13:55:47 +0000 (15:55 +0200)
author Christian Fillion <redacted>
Fri, 7 Feb 2025 13:55:47 +0000 (08:55 -0500)
committer GitHub <redacted>
Fri, 7 Feb 2025 13:55:47 +0000 (15:55 +0200)
diff --git a/src/unicode.cpp b/src/unicode.cpp

index 89180da4152da5f73047c94ac23268937db99ff3..a32ae6d0824f2e3718a4d5483d585781dc386544 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
      result.reserve(utf8.size());
      size_t offset = 0;
      while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // On invalid UTF-8, skip one byte and emit U+FFFD instead of leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
      }
      return result;
  }
author	Christian Fillion <redacted>
	Fri, 7 Feb 2025 13:55:47 +0000 (08:55 -0500)
committer	GitHub <redacted>
	Fri, 7 Feb 2025 13:55:47 +0000 (15:55 +0200)