mpt : do not duplicate token_embd.weight on disk (#5670)

author Jared Van Bortel <redacted>

Thu, 22 Feb 2024 22:05:23 +0000 (17:05 -0500)

committer GitHub <redacted>

Thu, 22 Feb 2024 22:05:23 +0000 (17:05 -0500)
author Jared Van Bortel <redacted>
Thu, 22 Feb 2024 22:05:23 +0000 (17:05 -0500)
committer GitHub <redacted>
Thu, 22 Feb 2024 22:05:23 +0000 (17:05 -0500)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py

index 481198dad042c7cc84d3a9a95b7874220b2fdc57..9bdfce07ab7dbdd79e90bc7637ca0c3353689f11 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -622,11 +622,6 @@ class MPTModel(Model):
  
              self.gguf_writer.add_tensor(new_name, data)
  
-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-
  
  class OrionModel(Model):
      def set_vocab(self):
diff --git a/llama.cpp b/llama.cpp

index 2ebd40df234f0aacc6c126c18df979a2a138ad06..37477e6ef3c44dc394fea568bcdccc15ebbe7c9d 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
          {
              { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
              { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
              { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
              { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
              { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -4056,7 +4055,10 @@ static bool llm_load_tensors(
                          model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                          model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
  
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                      }
  
                      for (int i = 0; i < n_layer; ++i) {
author	Jared Van Bortel <redacted>
	Thu, 22 Feb 2024 22:05:23 +0000 (17:05 -0500)
committer	GitHub <redacted>
	Thu, 22 Feb 2024 22:05:23 +0000 (17:05 -0500)
convert-hf-to-gguf.py		patch | blob | history
llama.cpp		patch | blob | history