llama : remove deprecated codecvt (#19565)

author Adrien Gallouët <redacted>

Fri, 13 Feb 2026 05:43:53 +0000 (06:43 +0100)

committer GitHub <redacted>

Fri, 13 Feb 2026 05:43:53 +0000 (06:43 +0100)
author Adrien Gallouët <redacted>
Fri, 13 Feb 2026 05:43:53 +0000 (06:43 +0100)
committer GitHub <redacted>
Fri, 13 Feb 2026 05:43:53 +0000 (06:43 +0100)
diff --git a/src/unicode.cpp b/src/unicode.cpp

index adfc489d1f033b42d0ee42ce2e65df06927550fb..b88d953bd2714014c363c3a6c141f86ad1fc7b0f 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -1,16 +1,10 @@
-#if defined(_MSC_VER)
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
-#endif
-
  #include "unicode.h"
  #include "unicode-data.h"
  
  #include <algorithm>
  #include <cassert>
-#include <codecvt>
  #include <cstddef>
  #include <cstdint>
-#include <locale>
  #include <map>
  #include <regex>
  #include <stdexcept>
@@ -199,27 +193,6 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
      return map;
  }
  
-static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
-#if defined(__clang__)
-    // disable C++17 deprecation warning for std::codecvt_utf8
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
-
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic pop
-#endif
-
-    return conv.from_bytes(s);
-}
-
  static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
      std::vector<std::string> bpe_encoded_words;
      for (const auto & word : bpe_words) {
@@ -1028,10 +1001,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                      break;
                  }
              }
+            const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
  
              if (use_collapsed) {
                  // sanity-check that the original regex does not contain any non-ASCII characters
-                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
                  for (size_t i = 0; i < cpts_regex.size(); ++i) {
                      if (cpts_regex[i] >= 128) {
                          throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
@@ -1087,7 +1060,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                  bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
              } else {
                  // no unicode category used, we can use std::wregex directly
-                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+                std::wstring wregex_expr(cpts_regex.begin(), cpts_regex.end());
  
                  // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
                  std::wstring wtext(cpts.begin(), cpts.end());
author	Adrien Gallouët <redacted>
	Fri, 13 Feb 2026 05:43:53 +0000 (06:43 +0100)
committer	GitHub <redacted>
	Fri, 13 Feb 2026 05:43:53 +0000 (06:43 +0100)