git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
convert : fix nomic-bert-moe mask token (#13757)
author: Sigbjørn Skjæret <redacted>
Sun, 1 Jun 2025 16:07:21 +0000 (18:07 +0200)
committer: GitHub <redacted>
Sun, 1 Jun 2025 16:07:21 +0000 (18:07 +0200)
convert_hf_to_gguf.py
src/llama-vocab.cpp

index 42e8f9cc06e293a52cda118e720fff80475d69a9..ec3b5697d8f6f67cf5a8c9aef67a2760c9c46ee3 100755 (executable)
@@ -3889,6 +3889,12 @@ class BertModel(TextModel):
                 SentencePieceTokenTypes.UNKNOWN,
             ] + toktypes[3:-1]
 
+            if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+                # Add mask token missing from sentencepiece.bpe.model
+                tokens[250001] = b'<mask>'
+                scores[250001] = 0.0
+                toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
index d5a036a8c4413cb91c6d242ec2f049ced6b3cc62..b51976699ca7beaa19d908d8e58cab798afc7bdc 100644 (file)
@@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         std::string model_name;
         std::string tokenizer_pre;
+        std::string general_arch;
 
         ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+        ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
 
         // model name to lowercase
         std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         );
 
-        // set attributes by model/tokenizer name
-        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+        // set attributes by model/tokenizer/architecture name
+        if (false
+                || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
+                || _contains_any(general_arch, {"nomic-bert-moe"})
+           ) {
             _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
         } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
             for (auto id : cache_special_tokens) {