git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
convert : correctly handle LLaMA tokenizer for Jamba (#16470)
author: amirai21 <redacted>
Sat, 11 Oct 2025 08:33:41 +0000 (11:33 +0300)
committer: GitHub <redacted>
Sat, 11 Oct 2025 08:33:41 +0000 (10:33 +0200)
* fix: convert_hf_to_gguf - change Jamba non-sentencepiece mode (tokenizer.json) vocab construction

* fix: convert_hf_to_gguf - jamba non-sentencepiece tokenizer to use _set_vocab_llama_hf func

* fix: convert_hf_to_gguf - removed get_vocab_base_pre from jamba

convert_hf_to_gguf.py

index 43d345bcb480c9266e4b101b74153a4e694d5de5..8c5132193e0e0318c982832f21562cce52e96175 100755 (executable)
@@ -5966,20 +5966,12 @@ class Mamba2Model(TextModel):
 class JambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.JAMBA
 
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        del tokenizer  # unused
-
-        return "gpt-2"
-
     def set_vocab(self):
         if (self.dir_model / "tokenizer.model").is_file():
-            # Using Jamba's tokenizer.json causes errors on model load
-            # (something about "byte not found in vocab"),
-            # but there's a working tokenizer.model
             self._set_vocab_sentencepiece()
         else:
-            # Some Jamba models only have a tokenizer.json, which works.
-            self._set_vocab_gpt2()
+            self._set_vocab_llama_hf()
+            self.gguf_writer.add_add_space_prefix(False)
 
     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "mamba_d_model"])