llama: add custom newline split for Gemma 4 (#21406)

author Aman Gupta <redacted>

Sat, 4 Apr 2026 07:06:34 +0000 (15:06 +0800)

committer GitHub <redacted>

Sat, 4 Apr 2026 07:06:34 +0000 (15:06 +0800)
author Aman Gupta <redacted>
Sat, 4 Apr 2026 07:06:34 +0000 (15:06 +0800)
committer GitHub <redacted>
Sat, 4 Apr 2026 07:06:34 +0000 (15:06 +0800)
diff --git a/src/unicode.cpp b/src/unicode.cpp

index c2df90c6d9a41d0317107901c2c78014195a4451..506540163fa510f53501221d4f37840da31c7690 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -753,6 +753,35 @@ static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string &
      return bpe_offsets;
  }
  
+// regex: [^\n]+|[\n]+  (hand-written equivalent: alternating runs of non-newline and newline characters)
+// splits each chunk described by `offsets` (consecutive lengths, used as indices into the cpts of `text`) into maximal runs of non-newline / newline characters and returns the run lengths in order; runs never cross a chunk boundary
+static std::vector<size_t> unicode_regex_split_custom_newlines(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
+    bpe_offsets.reserve(offsets.size()); // initial capacity only - a chunk may be split into several runs
+
+    const auto cpts = unicode_cpts_from_utf8(text); // all positions below index into this sequence, not into the raw bytes of `text`
+
+    size_t start = 0; // index in cpts where the current chunk begins
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset; // one past the last element of this chunk
+        assert(offset_end <= cpts.size()); // the chunk must lie inside cpts; this check is compiled out when NDEBUG is defined
+        start = offset_end; // the next chunk begins where this one ends
+
+        size_t pos = offset_ini;
+        while (pos < offset_end) {
+            const bool is_newline = (cpts[pos] == '\n'); // kind of the run that starts at pos
+            const size_t run_start = pos;
+            while (pos < offset_end && (cpts[pos] == '\n') == is_newline) { // extend the run while the kind is unchanged and the chunk has not ended
+                pos++;
+            }
+            bpe_offsets.push_back(pos - run_start); // emit the length of the finished run (always >= 1)
+        }
+    }
+
+    return bpe_offsets;
+}
+
  static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
      std::vector<size_t> bpe_offsets;
  
@@ -769,6 +798,8 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
      } else if (regex_expr == "\\p{AFMoE_digits}") {
          // AFMOE digit pattern - use custom implementation for proper splitting
          bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
+    } else if (regex_expr == "[^\\n]+|[\\n]+") {
+        bpe_offsets = unicode_regex_split_custom_newlines(text, offsets);
      } else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
          // tiny_aya digit grouping pattern from tokenizer.json:
          //   {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
author	Aman Gupta <redacted>
	Sat, 4 Apr 2026 07:06:34 +0000 (15:06 +0800)
committer	GitHub <redacted>
	Sat, 4 Apr 2026 07:06:34 +0000 (15:06 +0800)