llama : support GEGLU for jina-bert-v2 (#14090)
author     Sigbjørn Skjæret <redacted>
           Tue, 10 Jun 2025 16:02:08 +0000 (18:02 +0200)
committer  GitHub <redacted>
           Tue, 10 Jun 2025 16:02:08 +0000 (18:02 +0200)
convert_hf_to_gguf.py
gguf-py/gguf/tensor_mapping.py
src/llama-graph.cpp
src/llama-model.cpp
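
The change in brief: jina-bert-v2 checkpoints keep the FFN gate and up projections fused in a single tensor (gated_layers, or up_gated_layer for jina-v2-code). Instead of splitting that tensor at conversion time, it is now stored as a double-width ffn_up and the graph selects LLM_FFN_GEGLU, so the split happens when the graph is built. A minimal numpy sketch of the GEGLU activation over such a fused projection follows; the tanh-approximation GELU and the "first half is the gated half" convention are illustrative assumptions, not taken from this diff.

```python
# Minimal sketch (not llama.cpp code): GEGLU over a fused up/gate projection.
# Assumptions: tanh-approximation GELU; the first half of the fused activation
# is the half that gets GELU'd (see build_ffn for the authoritative split).
import numpy as np

def gelu(x):
    # tanh approximation of GELU
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

def geglu(fused):
    # fused has shape (..., 2 * n_ff): split into halves, gate one with GELU
    n_ff = fused.shape[-1] // 2
    x0, x1 = fused[..., :n_ff], fused[..., n_ff:]
    return gelu(x0) * x1

# toy example: hidden size 4, n_ff 3 -> fused projection width 6
rng  = np.random.default_rng(0)
h    = rng.standard_normal((2, 4)).astype(np.float32)   # two token embeddings
w_up = rng.standard_normal((4, 6)).astype(np.float32)   # fused [gate | up] weight
print(geglu(h @ w_up).shape)                             # (2, 3)
```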

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7b9893c8a3e10e0b99cfe175eff45425001cd402..a208c42ba9a8b9f385dc5801db7f5535d669accf 100755
@@ -4798,25 +4798,6 @@ class OlmoeModel(TextModel):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-
-            yield name, data
-
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -4832,14 +4813,6 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
 
 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):
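
The removed get_tensors/modify_tensors overrides did the gate/up split in Python during conversion; with the fused tensor now mapped directly to ffn_up (see the tensor-mapping change below), that split moves into llm_graph_context::build_ffn. A rough before/after sketch, with a toy matrix standing in for a gated_layers weight:

```python
# Illustrative sketch of the conversion-time change (not the converter itself).
import numpy as np

intermediate_size = 3   # hypothetical n_ff
n_embd = 4
data = np.arange(2 * intermediate_size * n_embd, dtype=np.float32).reshape(2 * intermediate_size, n_embd)

# Before this commit (for a 'gated_layers' tensor; the removed code used the
# opposite halves for 'up_gated_layer'):
gate_half = data[:intermediate_size, :]   # was emitted as ...gated_layers_w
up_half   = data[intermediate_size:, :]   # was emitted as ...gated_layers_v

# After this commit: the fused (2 * n_ff, n_embd) tensor is written unchanged
# and mapped to ffn_up; the split happens later at graph-build time.
ffn_up_fused = data
assert ffn_up_fused.shape[0] == gate_half.shape[0] + up_half.shape[0]
```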
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 93dd1d8028f3dc2fca53165a8d3ef73b1135c837..439fc1afeeb0cb9cbd53b2912959b2f487f20581 100644
@@ -333,7 +333,9 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",                          # nomic-bert
             "encoder.layers.{bid}.mlp.fc1",                           # nomic-bert-moe
             "model.layers.{bid}.mlp.c_fc",                            # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v",                 # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_v",                 # jina-bert-v2 (split up/gate, no longer used)
+            "encoder.layer.{bid}.mlp.gated_layers",                   # jina-bert-v2 (GEGLU)
+            "encoder.layer.{bid}.mlp.up_gated_layer",                 # jina-v2-code (GEGLU)
             "model.layers.{bid}.residual_mlp.w3",                     # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",                 # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",                         # exaone
@@ -370,7 +372,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",    # plamo
             "model.layers.{bid}.feed_forward.w1",         # internlm2
             "encoder.layers.{bid}.mlp.fc12",              # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w",     # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_w",     # jina-bert-v2 (split up/gate, no longer used)
             "transformer.h.{bid}.mlp.linear_1",           # refact
             "model.layers.{bid}.residual_mlp.w1",         # arctic
             "transformer.h.{bid}.mlp.c_fc_0",             # exaone
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 27c9ab74be1125e6b7811c75a0a5ae92bf6be3a0..56082279119d89e2f97ee639c66d8ad89afbc4cf 100644
@@ -650,6 +650,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
                 int64_t split_point = cur->ne[0] / 2;
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
 
@@ -663,7 +664,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Split into two equal parts
                 int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
 
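
The TODO above concerns the two ggml_cont calls: the halves are taken as views into a wider row, so neither view is contiguous in memory and both are currently copied before use. A numpy analogue of the same situation (numpy shapes read in the opposite order from ggml's ne, so the split here is along the last axis):

```python
# Numpy analogue of the two views created for the GEGLU split.
import numpy as np

n_tokens, width = 4, 6
cur = np.arange(n_tokens * width, dtype=np.float32).reshape(n_tokens, width)

split_point = width // 2
x0 = cur[:, :split_point]        # first half of each row
x1 = cur[:, split_point:]        # second half, starts mid-row

print(x0.flags['C_CONTIGUOUS'])  # False: the stride still spans the full row
print(x1.flags['C_CONTIGUOUS'])  # False
# np.ascontiguousarray plays the role of ggml_cont here: it makes a packed copy.
print(np.ascontiguousarray(x1).flags['C_CONTIGUOUS'])  # True
```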
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c41ee24507fca47f7cab1ca353c4edfd4ec7bd9a..f4a66390c79812a06a7999e4372333b494f0b673 100644
@@ -2224,8 +2224,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                         layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
 
                         layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                         layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
@@ -6043,7 +6043,7 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_gate, NULL,                        NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                        model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
             } else {
                 cur = build_ffn(cur,
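
Taken together, the loader change makes ffn_gate optional for this architecture: when a separate gate tensor is present the old parallel-GELU path is kept, and when only the fused tensor exists ffn_up is expected to be twice as wide and the GEGLU path is selected. A condensed sketch of that decision, with hypothetical shapes standing in for the real create_tensor/build_ffn calls:

```python
# Condensed sketch of the loader/graph decision (hypothetical helper, not the
# real create_tensor / build_ffn API).
def pick_ffn_variant(has_ffn_gate, n_embd, n_ff):
    if has_ffn_gate:
        # split checkpoint: separate gate and up tensors, parallel GELU
        return (n_embd, n_ff), "LLM_FFN_GELU"      # used with LLM_FFN_PAR
    # fused checkpoint: double-width up tensor, split at graph-build time
    return (n_embd, 2 * n_ff), "LLM_FFN_GEGLU"

print(pick_ffn_variant(True,  1024, 4096))   # ((1024, 4096), 'LLM_FFN_GELU')
print(pick_ffn_variant(False, 1024, 4096))   # ((1024, 8192), 'LLM_FFN_GEGLU')
```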