unicode : MSVC regex fix (#19340)

author Lasse Lauwerys <redacted>

Fri, 6 Feb 2026 13:56:13 +0000 (14:56 +0100)

committer GitHub <redacted>

Fri, 6 Feb 2026 13:56:13 +0000 (15:56 +0200)
author Lasse Lauwerys <redacted>
Fri, 6 Feb 2026 13:56:13 +0000 (14:56 +0100)
committer GitHub <redacted>
Fri, 6 Feb 2026 13:56:13 +0000 (15:56 +0200)
diff --git a/src/unicode.cpp b/src/unicode.cpp

index b47dcbe6198a82cd42ab171925aa1200eb1603e8..adfc489d1f033b42d0ee42ce2e65df06927550fb 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -497,49 +497,26 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
      return bpe_offsets;
  }
  
-// use std::wregex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
-    std::vector<size_t> bpe_offsets; // store the offset of each word
-    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
-    size_t start = 0;
-    for (auto offset : offsets) {
-        std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
-        std::wcregex_iterator end;
-
-        int64_t start_idx = 0;
-        while (it != end) {
-            std::wcmatch match = *it;
-            if (match.position() > start_idx) {
-                bpe_offsets.emplace_back(match.position() - start_idx);
-            }
-            bpe_offsets.emplace_back(match.length());
-            start_idx = match.position() + match.length();
-            ++it;
-        }
-
-        if (start_idx < (int64_t) offset) {
-            bpe_offsets.emplace_back(offset - start_idx);
-        }
-        start += offset;
-    }
-
-    return bpe_offsets;
-}
-
-// use std::regex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
+template <typename CharT>
+static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
+    using BidirIt = typename std::basic_string<CharT>::const_iterator;
+#ifdef _MSC_VER
+    // Bypass bug in MSVC: https://github.com/ggml-org/llama.cpp/issues/17830
+    constexpr auto regex_flags = std::regex_constants::ECMAScript;
+#else
+    constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
+#endif
+    std::basic_regex<CharT> expr(regex, regex_flags);
      std::vector<size_t> bpe_offsets; // store the offset of each word
      bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
      size_t start = 0;
      for (auto offset : offsets) {
-        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
-        std::cregex_iterator end;
+        std::regex_iterator<BidirIt> it(text.begin() + start, text.begin() + start + offset, expr);
+        std::regex_iterator<BidirIt> end;
  
          int64_t start_idx = 0;
          while (it != end) {
-            std::cmatch match = *it;
+            std::match_results<BidirIt> match = *it;
              if (match.position() > start_idx) {
                  bpe_offsets.emplace_back(match.position() - start_idx);
              }
author	Lasse Lauwerys <redacted>
	Fri, 6 Feb 2026 13:56:13 +0000 (14:56 +0100)
committer	GitHub <redacted>
	Fri, 6 Feb 2026 13:56:13 +0000 (15:56 +0200)