model : full modern bert support (#18330)
author    Ryan Mangeno <redacted>
Thu, 19 Feb 2026 07:52:21 +0000 (02:52 -0500)
committer GitHub <redacted>
Thu, 19 Feb 2026 07:52:21 +0000 (08:52 +0100)
* full modern bert support

* added gelu op in rank pooling for modern bert

* still working on stuff, added mean calculation before classifier head

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
* first layer is dense, as per modern bert research paper

* Update src/llama-graph.cpp

Co-authored-by: Sigbjørn Skjæret <redacted>
* fixed set input for mean pooling to check if pooling type is ranking since modern bert does mean & rank

* Update src/llama-graph.cpp

Co-authored-by: Sigbjørn Skjæret <redacted>
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
---------

Co-authored-by: Sigbjørn Skjæret <redacted>
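
For reference, the RANK pooling path added below for ModernBERT first averages the token embeddings of each sequence and then applies the prediction head (dense projection, GELU, head norm) before the classifier output projection, following the transformers implementation referenced in the diff. Below is a minimal numpy sketch of that head flow; the shapes and random weights are illustrative stand-ins for the cls, cls.norm and cls.output tensors loaded from the GGUF file:

    # Illustrative sketch only: random weights stand in for the real
    # cls / cls.norm / cls.output tensors; ggml may use a different
    # GELU approximation than the one below.
    import numpy as np

    n_tokens, n_embd, n_cls_out = 8, 16, 1

    hidden = np.random.randn(n_tokens, n_embd)    # final_norm_out for one sequence
    w_cls  = np.random.randn(n_embd, n_embd)      # cls (dense) weight
    b_cls  = np.zeros(n_embd)                     # cls bias
    w_norm = np.ones(n_embd)                      # cls.norm weight (no bias)
    w_out  = np.random.randn(n_embd, n_cls_out)   # cls.output weight
    b_out  = np.zeros(n_cls_out)                  # cls.output bias

    def gelu(x):
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

    def layer_norm(x, w, eps=1e-5):
        return (x - x.mean(-1, keepdims=True)) / np.sqrt(x.var(-1, keepdims=True) + eps) * w

    pooled = hidden.mean(axis=0)                  # mean pooling, not the CLS token
    head   = layer_norm(gelu(pooled @ w_cls + b_cls), w_norm)
    score  = head @ w_out + b_out                 # reranker logit(s)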
12 files changed:
convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/tensor_mapping.py
src/llama-arch.cpp
src/llama-arch.h
src/llama-context.cpp
src/llama-graph.cpp
src/llama-graph.h
src/llama-model-saver.cpp
src/llama-model.cpp
src/llama-model.h
src/models/modern-bert.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7446e0df133496a4addb6578a9f3e3c06800d164..2e41a2eccec25efa830d2929d51375a1ce6f913f 100755 (executable)
@@ -11003,13 +11003,17 @@ class ModernBertModel(BertModel):
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # these layers act as MLM head, so we don't need them
-        if name.startswith("decoder."):
-            return
-
         if name.startswith("model."):
             name = name[6:]
 
+        if self.cls_out_labels:
+            # For BertForSequenceClassification (direct projection layer)
+            if name == "classifier.weight":
+                name = "classifier.out_proj.weight"
+
+            if name == "classifier.bias":
+                name = "classifier.out_proj.bias"
+
         yield from super().modify_tensors(data_torch, name, bid)
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 727e4dd96eecee30a9487f0ec23f65a3cca51f8f..4b0f81ecb24b29467615e7f79ab1ea03beebbfd4 100644 (file)
@@ -652,6 +652,7 @@ class MODEL_TENSOR(IntEnum):
     ENC_OUTPUT_NORM      = auto()
     CLS                  = auto() # classifier
     CLS_OUT              = auto() # classifier output projection
+    CLS_NORM             = auto()
     CONV1D               = auto()
     CONVNEXT_DW          = auto()
     CONVNEXT_NORM        = auto()
@@ -1088,6 +1089,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ENC_OUTPUT_NORM:           "enc.output_norm",
     MODEL_TENSOR.CLS:                       "cls",
     MODEL_TENSOR.CLS_OUT:                   "cls.output",
+    MODEL_TENSOR.CLS_NORM:                  "cls.norm",
     MODEL_TENSOR.CONV1D:                    "conv1d",
     MODEL_TENSOR.CONVNEXT_DW:               "convnext.{bid}.dw",
     MODEL_TENSOR.CONVNEXT_NORM:             "convnext.{bid}.norm",
@@ -1507,6 +1509,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.CLS,
         MODEL_TENSOR.CLS_OUT,
+        MODEL_TENSOR.CLS_NORM,
     ],
     MODEL_ARCH.NOMIC_BERT: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 228ba70aa34f484da3d83c93ab2ac5bd6f9793d9..5fc75c52eb8d37763f07ab92ba426fdfb6503210 100644 (file)
@@ -1240,6 +1240,10 @@ class TensorNameMap:
         MODEL_TENSOR.CLS_OUT: (
             "classifier.out_proj", # roberta
         ),
+
+        MODEL_TENSOR.CLS_NORM: (
+            "head.norm", # modern-bert
+        ),
         #############################################################################
 
         MODEL_TENSOR.CONVNEXT_DW: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 7e4da4e78cf35b7f87d90dc52a3fb5add25546fc..965066cb668de0de73f0222b5d47b1fab40faca3 100644 (file)
@@ -367,6 +367,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_TOKEN_TYPES,                            "token_types" },
     { LLM_TENSOR_CLS,                                    "cls" },
     { LLM_TENSOR_CLS_OUT,                                "cls.output" },
+    { LLM_TENSOR_CLS_NORM,                               "cls.norm" },
     { LLM_TENSOR_ENC_OUTPUT_NORM,                        "enc.output_norm" },
     { LLM_TENSOR_FFN_GATE_INP_SHEXP,                     "blk.%d.ffn_gate_inp_shexp" },
     { LLM_TENSOR_SSM_A_NOSCAN,                           "blk.%d.ssm_a" },
@@ -828,6 +829,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_FFN_NORM,
                 LLM_TENSOR_CLS,
                 LLM_TENSOR_CLS_OUT,
+                LLM_TENSOR_CLS_NORM,
             };
         case LLM_ARCH_JINA_BERT_V2:
             return {
@@ -2518,6 +2520,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CLS_NORM,                   {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_DENSE_2_OUT,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
     {LLM_TENSOR_DENSE_3_OUT,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
     {LLM_TENSOR_OUTPUT_NORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 521944370b4471b4c8459c2d28a63c46a1ccace0..e37f634e373903a93d91068130be555cb5e618df 100644 (file)
@@ -497,6 +497,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
+    LLM_TENSOR_CLS_NORM,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index fc05989aa556b2dc443e2c817a0f6c62ef509790..7f4b4a933eaf37d36974077ccccf3896aeaa1bc3 100644 (file)
@@ -2761,6 +2761,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     llama_set_param(model->cls_b,           param_filter, param_filter_ud);
     llama_set_param(model->cls_out,         param_filter, param_filter_ud);
     llama_set_param(model->cls_out_b,       param_filter, param_filter_ud);
+    llama_set_param(model->cls_norm,        param_filter, param_filter_ud);
 
     for (struct llama_layer & layer : model->layers) {
         for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 70d8ff02a922fa510bd6b79e9ebb67d6210fb782..692724987379c74eaac61ab6dc353a83a3a0ac73 100644 (file)
@@ -185,7 +185,10 @@ bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
 }
 
 void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+    if (cparams.embeddings   &&
+       (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN ||
+        cparams.pooling_type == LLAMA_POOLING_TYPE_RANK )) {
+
         const int64_t n_tokens     = ubatch->n_tokens;
         const int64_t n_seq_tokens = ubatch->n_seq_tokens;
         const int64_t n_seqs_unq   = ubatch->n_seqs_unq;
@@ -2437,7 +2440,8 @@ void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,
         ggml_tensor * cls_out,
-        ggml_tensor * cls_out_b) const {
+        ggml_tensor * cls_out_b,
+        ggml_tensor * cls_norm) const {
     if (!cparams.embeddings) {
         return;
     }
@@ -2476,8 +2480,15 @@ void llm_graph_context::build_pooling(
             } break;
         case LLAMA_POOLING_TYPE_RANK:
             {
-                ggml_tensor * inp_cls = build_inp_cls();
-                cur = ggml_get_rows(ctx0, inp, inp_cls);
+                if (arch == LLM_ARCH_MODERN_BERT) {
+                    // modern bert gte reranker builds mean first then applies prediction head and classifier
+                    // https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modular_modernbert.py#L1404-1411
+                    ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } else {
+                    ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                }
 
                 // classification head
                 // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
@@ -2486,7 +2497,15 @@ void llm_graph_context::build_pooling(
                     if (cls_b) {
                         cur = ggml_add(ctx0, cur, cls_b);
                     }
-                    cur = ggml_tanh(ctx0, cur);
+                    if (arch == LLM_ARCH_MODERN_BERT) {
+                        cur = ggml_gelu(ctx0, cur);
+                    } else {
+                        cur = ggml_tanh(ctx0, cur);
+                    }
+                    if (cls_norm) {
+                        // head norm
+                        cur = build_norm(cur, cls_norm, NULL, LLM_NORM, -1);
+                    }
                 }
 
                 // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
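
The set_input change above lets the RANK case reuse the existing mean-pooling input: inp_mean carries one column per sequence whose entries are 1/seq_len for that sequence's tokens, so multiplying the token-embedding matrix by it produces one mean embedding per sequence, which is what the new ModernBERT branch feeds into the classification head. A small numpy sketch of that equivalence (sizes illustrative):

    # Illustrative sketch of the per-sequence mean built from inp and inp_mean.
    import numpy as np

    n_embd, n_tokens, n_seqs = 4, 6, 2
    seq_id = np.array([0, 0, 0, 1, 1, 1])        # sequence id of each token in the ubatch

    inp      = np.random.randn(n_embd, n_tokens) # one embedding column per token
    inp_mean = np.zeros((n_tokens, n_seqs))
    for s in range(n_seqs):
        tok = np.where(seq_id == s)[0]
        inp_mean[tok, s] = 1.0 / len(tok)        # the 1/seq_len weights set_input fills in

    pooled = inp @ inp_mean                      # one mean embedding per sequence
    assert np.allclose(pooled[:, 0], inp[:, seq_id == 0].mean(axis=1))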
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 1d69ff1a6fcdf5a92cca174aa3c9ebd3d0afdd34..74a4685121dc8aacbe91f48cb228dfc1a2fbe465 100644 (file)
@@ -1000,7 +1000,8 @@ struct llm_graph_context {
             ggml_tensor * cls,
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
-            ggml_tensor * cls_out_b) const;
+            ggml_tensor * cls_out_b,
+            ggml_tensor * cls_norm) const;
 
     //
     // sampling (backend sampling)
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 36e353074e0366e506e0a55fe63b36233887d582..676efeda709342b88b8448790027c408024736bb 100644 (file)
@@ -271,6 +271,7 @@ void llama_model_saver::add_tensors_from_model() {
     add_tensor(model.cls_b);
     add_tensor(model.cls_out);
     add_tensor(model.cls_out_b);
+    add_tensor(model.cls_norm);
 
     for (const struct llama_layer & layer : model.layers) {
         for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 279a4d5ced062877f62c216ac1416dfecdd64f63..2aebaddf27dcf2559180f35e3edf323f49de0507 100644 (file)
@@ -908,7 +908,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
                     ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
+                    hparams.set_swa_pattern(swa_period, true);
                 } else {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                 }
@@ -3513,9 +3513,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                     }
 
-                    cls       = create_tensor(tn(LLM_TENSOR_CLS,     "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
-                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
-                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+                    cls       = create_tensor(tn(LLM_TENSOR_CLS,      "weight"), {n_embd, n_embd},            TENSOR_NOT_REQUIRED);
+                    cls_norm  = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd},                    TENSOR_NOT_REQUIRED);
 
                 } break;
             case LLM_ARCH_NEO_BERT:
@@ -8734,7 +8735,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     }
 
     // add on pooling layer
-    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
+    llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
 
     // add backend sampling layers (if any)
     llm->build_sampling();
diff --git a/src/llama-model.h b/src/llama-model.h
index b3505914293b47336273ca49d033a06102ae684e..5ffba24fe98feb93737aae4c8b23d93bf369b5ff 100644 (file)
@@ -475,6 +475,7 @@ struct llama_model {
     struct ggml_tensor * cls_b     = nullptr;
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
+    struct ggml_tensor * cls_norm  = nullptr;
 
     struct ggml_tensor * conv1d   = nullptr;
     struct ggml_tensor * conv1d_b = nullptr;
diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp
index bb12ed819f735122a5ce3068bf26e8c05809e298..32066c712b494d29ae5bf0e22d7e3d623a103dbb 100644 (file)
@@ -104,13 +104,6 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
             LLM_NORM, -1);
     cb(cur, "final_norm_out", -1);
 
-    if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
-        // extracting cls token
-        cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
-        cb(cur, "cls_pooled_embd", -1);
-    }
-
-    cb(cur, "res_embd", -1);
     res->t_embd = cur;
     ggml_build_forward_expand(gf, cur);
 }