"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
+ // Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
+ // normalizer, then BPE merges run on the whole text without
+ // word-level pre-splitting. We only need to split on newlines
+ // since BPE merge lookup asserts no newlines in tokens.
+ // The regex partitions the text into alternating runs: maximal
+ // runs of non-newline characters and maximal runs of newlines,
+ // so every resulting "word" is either newline-free or newlines only.
+ regex_exprs = {
+ "[^\\n]+|[\\n]+",
+ };
+ byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
+ break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
}
std::vector<std::string> regex_exprs;
+ bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
};
struct llm_tokenizer_bpe_session {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
- const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+ const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
symbols_final.clear();
+ auto tok_pre = vocab.get_pre_type();
for (const auto & word : word_collection) {
work_queue = llm_bigram_bpe::queue();
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
+ } else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
+ // Gemma4: a "word" consisting solely of newlines (produced by the
+ // GEMMA4 pre-tokenizer regex) is emitted as a single symbol when
+ // the vocab contains an exact token for the whole run.
+ // fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
+ auto tok = vocab.text_to_token(word);
+ if (tok != LLAMA_TOKEN_NULL) {
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+ offset = word.size();
+ }
+ // NOTE(review): when the lookup fails, offset is left unchanged so
+ // the per-character loop below tokenizes the run normally —
+ // presumably offset starts at 0 here; its initialization is outside
+ // this hunk, confirm against the surrounding code.
}
while (offset < word.size()) {
special_pad_id = 3; // <|plamo:pad|>
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "gemma4") {
- type = LLAMA_VOCAB_TYPE_SPM;
+ type = LLAMA_VOCAB_TYPE_BPE;
+
+ // read bpe merges and populate bpe ranks
+ // Each array entry is a merge rule of the form "first second";
+ // its array index i is the merge rank (lower = higher priority).
+ const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+ if (merges_keyidx == -1) {
+ throw std::runtime_error("cannot find tokenizer merges in model file\n");
+ }
+ {
+ const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+ for (int i = 0; i < n_merges; i++) {
+ const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+
+ std::string first;
+ std::string second;
+
+ // search for the separator starting at index 1 so that a first
+ // piece beginning with a space is not split at position 0
+ const size_t pos = word.find(' ', 1);
+
+ if (pos != std::string::npos) {
+ first = word.substr(0, pos);
+ second = word.substr(pos + 1);
+ }
+
+ // NOTE(review): if an entry contains no separator, first/second
+ // stay empty and an ("","") pair is inserted silently — consider
+ // asserting or logging on malformed merge entries.
+ bpe_ranks.emplace(std::make_pair(first, second), i);
+ }
+ }
// default special tokens (to be read from GGUF)
special_bos_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
- tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ tokenizer_pre = "gemma4";
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
// for now, only BPE models have pre-tokenizers
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
+ escape_whitespaces = false;
clean_spaces = true;
if (tokenizer_pre.empty()) {
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
} else if (
tokenizer_pre == "jais-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
+ } else if (
+ tokenizer_pre == "gemma4") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
+ // SPM-style BPE: spaces in the input are escaped to ▁ before
+ // tokenization (and unescaped again when converting tokens to text)
+ escape_whitespaces = true;
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ // SPM-style BPE vocabs (e.g. Gemma4) store spaces as ▁ in their
+ // tokens, so escape whitespace in the raw text before tokenizing
+ if (escape_whitespaces) {
+ llama_escape_whitespace(text);
+ }
+
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+ if (escape_whitespaces) {
+ // SPM-style BPE: tokens contain ▁ for spaces
+ std::string result = token_text;
+ llama_unescape_whitespace(result);
+ return _try_copy(result.data(), result.size());
+ }
std::string result = llama_decode_text(token_text);
return _try_copy(result.data(), result.size());
}