else:
_mistral_common_installed = True
+try:
+ from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
+ get_one_valid_tokenizer_file,
+ )
+except ImportError:
+ # We still want the conversion to work with older mistral-common versions.
+ get_one_valid_tokenizer_file = None
+
import gguf
# Find the tokenizer files
all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
- valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
-
- if len(valid_tokenizer_files) == 0:
- raise ValueError(f"No tokenizer file found in the directory: {base_path}")
- # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
- if len(valid_tokenizer_files) > 1:
- if "tekken.json" in valid_tokenizer_files:
- tokenizer_file = "tekken.json"
- else:
- tokenizer_file = sorted(valid_tokenizer_files)[-1]
- logger.warning(
- f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
- )
+
+ if get_one_valid_tokenizer_file is not None:
+ tokenizer_file_path = get_one_valid_tokenizer_file(all_files)
else:
- tokenizer_file = valid_tokenizer_files[0]
+ valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
+
+ if len(valid_tokenizer_files) == 0:
+ raise ValueError(f"No tokenizer file found in the directory: {base_path}")
+ # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
+ if len(valid_tokenizer_files) > 1:
+ if "tekken.json" in valid_tokenizer_files:
+ tokenizer_file = "tekken.json"
+ else:
+ tokenizer_file = sorted(valid_tokenizer_files)[-1]
+ logger.warning(
+ f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
+ )
+ else:
+ tokenizer_file = valid_tokenizer_files[0]
+
+ tokenizer_file_path = base_path / tokenizer_file
self.tokenizer = MistralTokenizer.from_file(
- base_path / tokenizer_file
+ tokenizer_file_path
).instruct_tokenizer.tokenizer
self.tokenizer_type = (
MistralTokenizerType.tekken
else MistralTokenizerType.spm
)
self.vocab_size = self.tokenizer.n_words
- self.fname_tokenizer = base_path / tokenizer_file
+ self.fname_tokenizer = tokenizer_file_path
self._name = (
"mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
)