From: Sigbjørn Skjæret Date: Wed, 19 Mar 2025 08:08:49 +0000 (+0100) Subject: llama : add support for GPT2, Bloom and CodeShell tied word embeddings (#12456) X-Git-Tag: upstream/0.0.5028~105 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=108e53c2f1b57b79c8bcf9773fba5a82e2f1eeb5;p=pkg%2Fggml%2Fsources%2Fllama.cpp llama : add support for GPT2, Bloom and CodeShell tied word embeddings (#12456) * Add support for GPT2, Bloom and CodeShell tied word embeddings * Deduplicate tied word embeddings weights * Workaround for incorrect weight map It appears transformer.wte.weight is in the weight map even though the weights are not there, remove it if output weights are encountered first. * check++ * fatfingers-- --- diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7a2ef4c7..7574218e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -180,7 +180,8 @@ class Model: extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}") else: raise ValueError("Mismatch between weight map and model parts for tensor names:\n" f"Missing tensors: {missing}\n" @@ -1099,13 +1100,6 @@ class BloomModel(Model): tensors.append((self.map_tensor_name(name), data_torch)) - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -2423,10 +2417,6 @@ class GPT2Model(Model): tensors.append((new_name, data_torch)) - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -2756,21 +2746,26 @@ class CodeShellModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) + _has_tok_embd = False + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - new_name = self.map_tensor_name(name) - - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None + new_name = self.map_tensor_name(name) - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + # assuming token_embd.weight is seen before output.weight + if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + # even though the tensor file(s) does not contain the word embeddings they are still in the weight map + if self.tensor_names and "transformer.wte.weight" in self.tensor_names: + logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied") + self.tensor_names.remove("transformer.wte.weight") + elif new_name == tok_embd_name: + self._has_tok_embd = True - return tensors + return [(new_name, data_torch)] @Model.register("InternLM2ForCausalLM") diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d286176c..17af8cc3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_CODESHELL: { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + // if tok embd is NULL, init from output + if (tok_embd == NULL) { + tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);