vocab: fix Gemma4 tokenizer (#21343)

author Piotr Wilkin (ilintar) <redacted>

Fri, 3 Apr 2026 08:33:03 +0000 (10:33 +0200)

committer GitHub <redacted>

Fri, 3 Apr 2026 08:33:03 +0000 (10:33 +0200)
author Piotr Wilkin (ilintar) <redacted>
Fri, 3 Apr 2026 08:33:03 +0000 (10:33 +0200)
committer GitHub <redacted>
Fri, 3 Apr 2026 08:33:03 +0000 (10:33 +0200)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index de1def320167017d59b01881564acf246b4923ff..d4929d6b6f852894f9313fd69e66239c46984f73 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7464,9 +7464,6 @@ class Gemma4Model(Gemma3Model):
  
          assert len(tokens) == vocab.vocab_size
  
-        # TODO @ngxson : there are some known (rare) issues with the tokenizer during development
-        # but I don't have time to dive into them right now;
-        # using a dedicated tokenizer name so that we can fix later without re-converting GGUF
          self.gguf_writer.add_tokenizer_model("gemma4")
          self.gguf_writer.add_token_list(tokens)
          self.gguf_writer.add_token_scores(scores)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index bce9d837c7ab8a3b48822655c4b32f525783fdba..5bce88aab40849f1a81c468fc3857df47838796d 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                      "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                  };
                  break;
+            case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
+                // Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
+                // normalizer, then BPE merges run on the whole text without
+                // word-level pre-splitting. We only need to split on newlines
+                // since BPE merge lookup asserts no newlines in tokens.
+                regex_exprs = {
+                    "[^\\n]+|[\\n]+",
+                };
+                byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
+                break;
              default:
                  // default regex for BPE tokenization pre-processing
                  regex_exprs = {
@@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
      }
  
      std::vector<std::string> regex_exprs;
+    bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
  };
  
  struct llm_tokenizer_bpe_session {
@@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {
  
      void tokenize(const std::string & text, std::vector<llama_token> & output) {
          int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
  
          symbols_final.clear();
+        auto tok_pre = vocab.get_pre_type();
  
          for (const auto & word : word_collection) {
              work_queue = llm_bigram_bpe::queue();
@@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
              if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                  symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                  offset = word.size();
+            } else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
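+                // Gemma 4: a word made up only of '\n' characters that is itself a vocab token is emitted as a single symbol, skipping the BPE merge loop below (ref: https://github.com/ggml-org/llama.cpp/pull/21343)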
+                auto tok = vocab.text_to_token(word);
+                if (tok != LLAMA_TOKEN_NULL) {
+                    symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                    offset = word.size();
+                }
              }
  
              while (offset < word.size()) {
@@ -1864,7 +1883,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              special_pad_id = 3;  // <|plamo:pad|>
              special_mask_id = LLAMA_TOKEN_NULL;
          } else if (tokenizer_model == "gemma4") {
-            type = LLAMA_VOCAB_TYPE_SPM;
+            type = LLAMA_VOCAB_TYPE_BPE;
+
+            // read bpe merges and populate bpe ranks
+            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+            {
+                const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+                for (int i = 0; i < n_merges; i++) {
+                    const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+
+                    std::string first;
+                    std::string second;
+
+                    const size_t pos = word.find(' ', 1);
+
+                    if (pos != std::string::npos) {
+                        first  = word.substr(0, pos);
+                        second = word.substr(pos + 1);
+                    }
+
+                    bpe_ranks.emplace(std::make_pair(first, second), i);
+                }
+            }
  
              // default special tokens (to be read from GGUF)
              special_bos_id  = LLAMA_TOKEN_NULL;
@@ -1874,7 +1917,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              special_pad_id  = LLAMA_TOKEN_NULL;
              special_mask_id = LLAMA_TOKEN_NULL;
  
-            tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_pre = "gemma4";
          } else {
              throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
          }
@@ -1882,6 +1925,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
          // for now, only BPE models have pre-tokenizers
          if (type == LLAMA_VOCAB_TYPE_BPE) {
              add_space_prefix = false;
+            escape_whitespaces = false;
              clean_spaces = true;
              if (tokenizer_pre.empty()) {
                  LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@@ -1948,6 +1992,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              } else if (
                      tokenizer_pre == "jais-2") {
                  pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
+            } else if (
+                    tokenizer_pre == "gemma4") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
+                escape_whitespaces = true;
              } else if (
                      tokenizer_pre == "jina-v1-en" ||
                      tokenizer_pre == "jina-v2-code" ||
@@ -3045,6 +3093,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                      if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                          std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
  
+                        if (escape_whitespaces) {
+                            llama_escape_whitespace(text);
+                        }
+
  #ifdef PRETOKENIZERDEBUG
                          LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
  #endif
@@ -3224,6 +3276,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                      return _try_copy(token_text.data(), token_text.size());
                  }
                  if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    if (escape_whitespaces) {
+                        // SPM-style BPE: tokens contain ▁ for spaces
+                        std::string result = token_text;
+                        llama_unescape_whitespace(result);
+                        return _try_copy(result.data(), result.size());
+                    }
                      std::string result = llama_decode_text(token_text);
                      return _try_copy(result.data(), result.size());
                  }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h

index be5b08012df1e38985d79090c0fa005c9b8a7a58..dd38f45d3a2232a9d9777baf5c8e69aaeca2fe1c 100644 (file)
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
      LLAMA_VOCAB_PRE_TYPE_TINY_AYA        = 47,
      LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM       = 48,
      LLAMA_VOCAB_PRE_TYPE_JAIS2           = 49,
+    LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
  };
  
  struct LLM_KV;
diff --git a/src/unicode.cpp b/src/unicode.cpp

index 122c8ca04a511f52eb03b5d51d2d772104ff42c5..c2df90c6d9a41d0317107901c2c78014195a4451 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
      return false;
  }
  
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
      // unicode categories
      static const std::map<std::string, int> k_ucat_enum = {
          { "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
          start += offset;
      }
  
-    return unicode_byte_encoding_process(bpe_words);
+    if (byte_encode) {
+        return unicode_byte_encoding_process(bpe_words);
+    }
+
+    return bpe_words;
  }
diff --git a/src/unicode.h b/src/unicode.h

index 5bd1362ff41bf76c36dff72973533ac89d14b6d7..600ab9216b94dfac172ed771b3e9c1bdb42f5a2c 100644 (file)
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
  
  bool unicode_cpt_is_han(uint32_t cpt);
  
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);
author	Piotr Wilkin (ilintar) <redacted>
	Fri, 3 Apr 2026 08:33:03 +0000 (10:33 +0200)
committer	GitHub <redacted>
	Fri, 3 Apr 2026 08:33:03 +0000 (10:33 +0200)
convert_hf_to_gguf.py		patch \| blob \| history
src/llama-vocab.cpp		patch \| blob \| history
src/llama-vocab.h		patch \| blob \| history
src/unicode.cpp		patch \| blob \| history
src/unicode.h		patch \| blob \| history