convert : automatically fall back to HfVocab if tokenizer.model doesn't exist (#5821)

author Jared Van Bortel <redacted>
Sat, 2 Mar 2024 17:27:26 +0000 (12:27 -0500)
committer GitHub <redacted>
Sat, 2 Mar 2024 17:27:26 +0000 (12:27 -0500)
diff --git a/README.md b/README.md

index 67717c1e38b34719d9d2fed726908821b89dd420..9396467537c16ad7a255700dd560f7e61d20ee0f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -786,7 +786,7 @@ And after 4.45 hours, you will have the final perplexity.
  ### Interactive mode
  
  If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
  
  Here is an example of a few-shot interaction, invoked with the command
  
@@ -850,7 +850,7 @@ Sample run:
  ```
  == Running in interactive mode. ==
   - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
+ - Press Return to return control to LLaMA.
   - If you want to submit another line, end your input in '\'.
  
   Below is an instruction that describes a task. Write a response that appropriately completes the request.
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py

index b331080629be0995bc394679bc31a7378518a83d..cd9644fcb52133ed8dc5fb4a26a0d0584ed038a8 100755 (executable)
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
          raise ValueError('Unable to load metadata')
      vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
      vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
      convert.check_vocab_size(params, vocab)
      return params, vocab, special_vocab
  
@@ -398,8 +398,8 @@ def handle_args():
                          help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
      parser.add_argument("--vocab-dir", type=Path,
                          help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+    parser.add_argument("--vocabtype", default="spm,hfft",
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
      return parser.parse_args()
  
  
diff --git a/convert.py b/convert.py

index 63a0a5d78075b3b479440a33a0b9dc290b3a65f8..6e3a0319b1e462e5aed2042ce14fbce4824246ff 100755 (executable)
--- a/convert.py
+++ b/convert.py
@@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus:
  
  
  class VocabFactory:
+    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+
      def __init__(self, path: Path):
          self.path = path
-        self.files: dict[str, Path | None] = {
-            "tokenizer.model": None,
-            "vocab.json": None,
-            "tokenizer.json": None,
-        }
-        self._detect_files()
-
-    def _detect_files(self):
-        for file in self.files.keys():
-            file_path = self.path / file
-            parent_file_path = self.path.parent / file
-            if file_path.exists():
-                self.files[file] = file_path
-            elif parent_file_path.exists():
-                self.files[file] = parent_file_path
-        print(f"Found vocab files: {self.files}")
-
-    def _select_file(self, vocabtype: str | None) -> Path:
-        if vocabtype in ["spm", "bpe"]:
-            for file_key in self.files.keys():
-                if (file := self.files[file_key]) is not None:
-                    return file
-            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        if vocabtype == "hfft":
-            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
-            return self.path
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        self.file_paths = self._detect_files()
+        print(f"Found vocab files: {self.file_paths}")
+
+    def _detect_files(self) -> dict[str, Path | None]:
+        def locate(file: str) -> Path | None:
+            if (path := self.path / file).exists():
+                return path
+            if (path := self.path.parent / file).exists():
+                return path
+            return None
+
+        return {vt: locate(f) for vt, f in self._FILES.items()}
+
+    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
+        for vtype in vocab_types:
+            try:
+                path = self.file_paths[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+            if path is not None:
+                return vtype, path
+        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
  
      def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
          load_merges = vocabtype == "bpe"
@@ -1322,30 +1319,30 @@ class VocabFactory:
              n_vocab=n_vocab,
          )
  
-    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
-        path = self._select_file(vocabtype)
-        print(f"Loading vocab file '{path}', type '{vocabtype}'")
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab_type, path = self._select_file(vocab_types)
+        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
  
          added_tokens_path = path.parent / "added_tokens.json"
          vocab: Vocab
-        if vocabtype == "bpe":
+        if vocab_type == "bpe":
              vocab = BpeVocab(
                  path, added_tokens_path if added_tokens_path.exists() else None
              )
-        elif vocabtype == "spm":
+        elif vocab_type == "spm":
              vocab = SentencePieceVocab(
                  path, added_tokens_path if added_tokens_path.exists() else None
              )
-        elif vocabtype == "hfft":
+        elif vocab_type == "hfft":
              vocab = HfVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
+                path.parent, added_tokens_path if added_tokens_path.exists() else None
              )
          else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+            raise ValueError(vocab_type)
          # FIXME: Respect --vocab-dir?
          special_vocab = self._create_special_vocab(
              vocab,
-            vocabtype,
+            vocab_type,
              model_parent_path,
          )
          return vocab, special_vocab
@@ -1379,15 +1376,14 @@ def main(args_in: list[str] | None = None) -> None:
      if np.uint32(1) == np.uint32(1).newbyteorder("<"):
          # We currently only support Q8_0 output on little endian systems.
          output_choices.append("q8_0")
-    vocab_types = ["spm", "bpe", "hfft"]
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
      parser.add_argument("--awq-path",     type=Path,              help="Path to scale awq cache file", default=None)
      parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
      parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
      parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
      parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
      parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type",   choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
      parser.add_argument("--outfile",      type=Path,              help="path to write to; default: based on input")
      parser.add_argument("model",          type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
      parser.add_argument("--ctx",          type=int,               help="model training context (default: based on input)")
@@ -1448,7 +1444,7 @@ def main(args_in: list[str] | None = None) -> None:
      model_parent_path = model_plus.paths[0].parent
      vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
      vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
+    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
  
      if args.vocab_only:
          if not args.outfile:
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp

index d4b8729dd0283c2e0ece17d9022c22bc967054ca..91c39c5ae42e35dd4ec6675108470a184398a9a3 100644 (file)
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
      if (params.interactive) {
          const char *control_message;
          if (params.multiline_input) {
-            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                                " - To return control without starting a new line, end your input with '/'.\n";
          } else {
-            control_message = " - Press Return to return control to LLaMa.\n"
+            control_message = " - Press Return to return control to LLaMA.\n"
                                " - To return control without starting a new line, end your input with '/'.\n"
                                " - If you want to submit another line, end your input with '\\'.\n";
          }
author	Jared Van Bortel <redacted>
	Sat, 2 Mar 2024 17:27:26 +0000 (12:27 -0500)
committer	GitHub <redacted>
	Sat, 2 Mar 2024 17:27:26 +0000 (12:27 -0500)
README.md		patch | blob | history
convert-llama-ggml-to-gguf.py		patch | blob | history
convert.py		patch | blob | history
examples/infill/infill.cpp		patch | blob | history