special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
- def _set_vocab_sentencepiece(self):
+ def _set_vocab_sentencepiece(self, add_to_gguf=True):
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def _create_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model'
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_tokenizer_pre("default")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
+ return tokens, scores, toktypes
def _set_vocab_llama_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model)
model_arch = gguf.MODEL_ARCH.GEMMA2
def set_vocab(self):
- self._set_vocab_llama_hf()
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
+ # hack: This is required so that we can properly use start/end-of-turn for chat template
+ for i in range(108):
+ # including <unusedX>, <start_of_turn>, <end_of_turn>
+ toktypes[i] = SentencePieceTokenTypes.CONTROL
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):