llama : add support for GPT2, Bloom and CodeShell tied word embeddings (#12456)
author Sigbjørn Skjæret <redacted>
Wed, 19 Mar 2025 08:08:49 +0000 (09:08 +0100)
committer GitHub <redacted>
Wed, 19 Mar 2025 08:08:49 +0000 (09:08 +0100)
* Add support for GPT2, Bloom and CodeShell tied word embeddings

* Deduplicate tied word embeddings weights

* Workaround for incorrect weight map

It appears transformer.wte.weight is listed in the weight map even though the weights themselves are not in the tensor files, so remove it from the expected names if the output weights are encountered first (a short sketch of this check follows these notes).

* check++

* fatfingers--
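
For context on the notes above: "tied" word embeddings means the output (LM head) projection reuses the token-embedding matrix instead of carrying its own copy. Previously the converter worked around this by duplicating token_embd.weight into output.weight in the GGUF file; with this change the tensor is written once and the tie is re-established at load time. A minimal PyTorch-style sketch of the relationship (module sizes and names here are illustrative, not taken from the repository):

    import torch.nn as nn

    vocab_size, n_embd = 50257, 768                        # GPT-2-like sizes, illustration only

    tok_embd = nn.Embedding(vocab_size, n_embd)            # token_embd.weight in GGUF terms
    lm_head  = nn.Linear(n_embd, vocab_size, bias=False)   # output.weight in GGUF terms

    # Tying: the head shares the embedding's weight matrix, so serializing both
    # would store the exact same tensor twice.
    lm_head.weight = tok_embd.weight
    assert lm_head.weight.data_ptr() == tok_embd.weight.data_ptr()

Because the loader can now recreate the tie itself (see the llama-model.cpp hunks below), the converter no longer needs to emit the duplicate.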

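The weight-map workaround described above boils down to pruning a stale entry from the expected tensor names before the converter's missing-tensor check runs. A rough standalone sketch of that bookkeeping (the helper name, arguments, and example names are hypothetical; in the real converter this logic lives inside CodeShellModel.modify_tensors, as shown in the first diff below):

    def prune_stale_wte(tensor_names: set[str], new_name: str,
                        output_name: str, seen_tok_embd: bool) -> None:
        # Some CodeShell checkpoints list transformer.wte.weight in the weight map
        # even though the shard files only contain the (tied) output weights.
        # If the output tensor shows up before any token embedding, drop the stale
        # entry so the later "missing tensors" check does not trip over it.
        if not seen_tok_embd and new_name == output_name:
            tensor_names.discard("transformer.wte.weight")

    names = {"transformer.wte.weight", "output.weight"}
    prune_stale_wte(names, "output.weight", "output.weight", seen_tok_embd=False)
    assert "transformer.wte.weight" not in names
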
convert_hf_to_gguf.py
src/llama-model.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7a2ef4c7e38ce88db69fd878a0f7185d129afd46..7574218e241d46e6c121e6da06c15136b6631697 100755 (executable)
@@ -180,7 +180,8 @@ class Model:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
@@ -1099,13 +1100,6 @@ class BloomModel(Model):
 
         tensors.append((self.map_tensor_name(name), data_torch))
 
-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors
 
 
@@ -2423,10 +2417,6 @@ class GPT2Model(Model):
 
         tensors.append((new_name, data_torch))
 
-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors
 
 
@@ -2756,21 +2746,26 @@ class CodeShellModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
-        new_name = self.map_tensor_name(name)
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
 
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
+        new_name = self.map_tensor_name(name)
 
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True
 
-        return tensors
+        return [(new_name, data_torch)]
 
 
 @Model.register("InternLM2ForCausalLM")
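
The second diff (src/llama-model.cpp) covers the loader side: output.weight becomes optional for GPT-2 and Bloom, and when it is missing the tensor is created from the token embedding instead; for CodeShell the fallback runs the other way, creating tok_embd from the output weights when it is absent. A Python-flavoured sketch of that fallback, standing in only for the actual create_tensor calls with the TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED flags:

    from typing import Optional

    def resolve_output(tensors: dict[str, object]) -> Optional[object]:
        # output.weight is no longer required to be present in the GGUF ...
        output = tensors.get("output.weight")
        if output is None:
            # ... and when it is absent, the token embedding is reused (tied weights)
            output = tensors.get("token_embd.weight")
        return output
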
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d286176c1ff839f7a2b6f40ad9de603bc104b90c..17af8cc30b0cbfb378c151203b58f32ea2eb6455 100644 (file)
@@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_CODESHELL:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if tok embd is NULL, init from output
+                    if (tok_embd == NULL) {
+                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     // output
                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);