fix: prevent segfault in tokenizer on highly repetitive input (#17786)

author Pascal <redacted>

Fri, 5 Dec 2025 11:52:23 +0000 (12:52 +0100)

committer GitHub <redacted>

Fri, 5 Dec 2025 11:52:23 +0000 (13:52 +0200)
author Pascal <redacted>
Fri, 5 Dec 2025 11:52:23 +0000 (12:52 +0100)
committer GitHub <redacted>
Fri, 5 Dec 2025 11:52:23 +0000 (13:52 +0200)
diff --git a/src/unicode.cpp b/src/unicode.cpp

index 77ba4fc46bc1170e52ce18370d3db3f46fc8d070..bb44edfaddffdbf159c26629a815fa8c7a4d8cd8 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
  
  // use std::wregex to split the text
  static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
      std::vector<size_t> bpe_offsets; // store the offset of each word
      bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
      size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
  
  // use std::regex to split the text
  static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
      std::vector<size_t> bpe_offsets; // store the offset of each word
      bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
      size_t start = 0;
author	Pascal <redacted>
	Fri, 5 Dec 2025 11:52:23 +0000 (12:52 +0100)
committer	GitHub <redacted>
	Fri, 5 Dec 2025 11:52:23 +0000 (13:52 +0200)