gguf : add ftype meta info to the model (#2710)

author Georgi Gerganov <redacted>

Tue, 22 Aug 2023 17:05:59 +0000 (20:05 +0300)

committer GitHub <redacted>

Tue, 22 Aug 2023 17:05:59 +0000 (20:05 +0300)
author Georgi Gerganov <redacted>
Tue, 22 Aug 2023 17:05:59 +0000 (20:05 +0300)
committer GitHub <redacted>
Tue, 22 Aug 2023 17:05:59 +0000 (20:05 +0300)
diff --git a/convert.py b/convert.py

index c29c032cd23ab2b3bb469aed2e718177180c60eb..71978d67161836a03538b964a2f6ba24b2b667b9 100644 (file)
--- a/convert.py
+++ b/convert.py
@@ -69,7 +69,10 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
      'I32': DT_I32,
  }
  
-class GGMLFileType(enum.Enum):
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
      AllF32    = 0
      MostlyF16 = 1  # except 1d tensors
  
@@ -101,6 +104,8 @@ class Params:
      n_head_kv:  int
      f_norm_eps: float
  
+    ftype: Optional[GGMLFileType] = None
+
      @staticmethod
      def find_n_mult(n_ff: int, n_embd: int) -> int:
          # hardcoded magic range
@@ -738,6 +743,9 @@ class OutputFile:
          self.gguf.add_head_count_kv       (params.n_head_kv)
          self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
  
+        if params.ftype is not None:  # explicit None check: GGMLFileType.AllF32 == 0 is falsy as an IntEnum member
+            self.gguf.add_file_type(params.ftype)
+
      def add_meta_vocab(self, vocab: Vocab) -> None:
          tokens = []
          scores = []
@@ -1020,6 +1028,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
                              " - LLaMA v2: --ctx 4096\n")
          params.n_ctx = args.ctx
  
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+        }[args.outtype]
+
      print(f"params = {params}")
  
      vocab: Vocab
@@ -1040,11 +1054,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
              vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
              vocab = load_vocab(vocab_dir, args.vocabtype)
  
-        model       = model_plus.model
-        model       = convert_model_names(model, params)
-        output_type = pick_output_type(model, args.outtype)
-        model       = convert_to_output_type(model, output_type)
-        outfile     = args.outfile or default_outfile(model_plus.paths, output_type)
+        model   = model_plus.model
+        model   = convert_model_names(model, params)
+        ftype   = pick_output_type(model, args.outtype)
+        model   = convert_to_output_type(model, ftype)
+        outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+        params.ftype = ftype
+        print(f"Writing {outfile}, format {ftype}")
  
          OutputFile.write_all(outfile, params, model, vocab)
          print(f"Wrote {outfile}")
diff --git a/gguf.py b/gguf.py

index 9776649c761193d4c80fb86893af61d3d857012f..46574671823284936fb2535974c0a365976b9437 100644 (file)
--- a/gguf.py
+++ b/gguf.py
@@ -26,6 +26,7 @@ KEY_GENERAL_DESCRIPTION          = "general.description"
  KEY_GENERAL_LICENSE              = "general.license"
  KEY_GENERAL_SOURCE_URL           = "general.source.url"
  KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE            = "general.file_type"
  
  # LLM
  KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
@@ -595,6 +596,9 @@ class GGUFWriter:
      def add_source_hf_repo(self, repo: str):
          self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
  
+    def add_file_type(self, ftype: int):  # record the file type under KEY_GENERAL_FILE_TYPE ("general.file_type")
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)  # written as a uint32 key/value pair
+
      def add_name(self, name: str):
          self.add_string(KEY_GENERAL_NAME, name)
  
diff --git a/llama.cpp b/llama.cpp

index 0584749c52c9cf240eb8817211d1ed6ae339eb24..6abdc44f2a0625da2593dea04729964579e3853a 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -995,6 +995,16 @@ struct llama_model_loader {
                       } break;
              }
  
+            // this is a way to mark that we have "guessed" the file type
+            ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+            {
+                const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+                if (kid >= 0) {
+                    ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                }
+            }
+
              for (int i = 0; i < n_kv; i++) {
                  const char * name         = gguf_get_key(ctx_gguf, i);
                  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1197,7 +1207,11 @@ struct llama_model_loader {
  // load LLaMA models
  //
  
-const char * llama_model_ftype_name(enum llama_ftype ftype) {
+std::string llama_model_ftype_name(enum llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
      switch (ftype) {
          case LLAMA_FTYPE_ALL_F32:     return "all F32";
          case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1426,7 +1440,7 @@ static void llama_model_load_internal(
          LLAMA_LOG_INFO("%s: freq_base    = %.1f\n",   __func__, hparams.rope_freq_base);
          LLAMA_LOG_INFO("%s: freq_scale   = %g\n",     __func__, hparams.rope_freq_scale);
          LLAMA_LOG_INFO("%s: model type   = %s\n",     __func__, llama_model_type_name(model.type));
-        LLAMA_LOG_INFO("%s: model ftype  = %s\n",     __func__, llama_model_ftype_name(model.ftype));
+        LLAMA_LOG_INFO("%s: model ftype  = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
          LLAMA_LOG_INFO("%s: model size   = %.2f B\n", __func__, ml->n_elements*1e-9);
  
          // general kv
@@ -3450,6 +3464,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      // copy the KV pairs from the input file
      gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
      gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
  
  #ifdef GGML_USE_K_QUANTS
      int n_attention_wv    = 0;
@@ -4310,7 +4325,7 @@ int llama_model_n_embd(const struct llama_model * model) {
  }
  
  int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
  }
  
  int llama_model_quantize(
diff --git a/llama.h b/llama.h

index aa5b7d69ca81abd130af962607013883db53b3d4..7ce478d5452a74cea0c898ad9bcc3fc20f7a5028 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -103,6 +103,8 @@ extern "C" {
          LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
          LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
          LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
      };
  
      typedef struct llama_token_data {
author	Georgi Gerganov <redacted>
	Tue, 22 Aug 2023 17:05:59 +0000 (20:05 +0300)
committer	GitHub <redacted>
	Tue, 22 Aug 2023 17:05:59 +0000 (20:05 +0300)
convert.py		patch \| blob \| history
gguf.py		patch \| blob \| history
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history