SentencePieceTokenTypes.UNKNOWN,
] + toktypes[3:-1]
+ if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+ # Add mask token missing from sentencepiece.bpe.model
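+ # (<mask> is the final entry, id 250001, of the 250002-token XLM-RoBERTa vocab)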
+ tokens[250001] = b'<mask>'
+ scores[250001] = 0.0
+ toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
self.gguf_writer.add_tokenizer_model("t5")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
std::string model_name;
std::string tokenizer_pre;
+ std::string general_arch;
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+ ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
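+ // general.architecture (e.g. "nomic-bert-moe") is matched below when assigning token attributes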
// model name to lowercase
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
    [] (const std::string::value_type x) {
        return std::tolower(x);
    }
);
- // set attributes by model/tokenizer name
- if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+ // set attributes by model/tokenizer/architecture name
+ if (false
+ || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
+ || _contains_any(general_arch, {"nomic-bert-moe"})
+ ) {
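+ // these tokenizers declare <mask> with lstrip=true, so the token must absorb preceding whitespace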
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
for (auto id : cache_special_tokens) {