yield from super().modify_tensors(data_torch, name, bid)
-@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
+@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
class Glm4VVisionModel(Qwen3VLVisionModel):
def set_gguf_parameters(self):
MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams["num_key_value_heads"]
n_embd = self.hparams["hidden_size"]
- head_dim = n_embd // n_head
+ head_dim = self.hparams.get("head_dim", n_embd // n_head)
# because the llama.cpp M-RoPE kernel only supports NeoX ordering, we have to permute the weights here
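# ("normal" ordering rotates adjacent pairs (d0, d1), (d2, d3), ... within each head,
#  while NeoX pairs the first and second halves of the rotated prefix; normal_to_neox
#  reorders the first head_dim * partial_rotary_factor rows of each head so that the
#  NeoX kernel reproduces the original rotation)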
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
yield from super().modify_tensors(data_torch, name, bid)
+@ModelBase.register("GlmOcrForConditionalGeneration")
+class GlmOCRModel(Glm4Model):
+ model_arch = gguf.MODEL_ARCH.GLM4
+ use_mrope = False
+ partial_rotary_factor = 0.5
+
+ # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+        # GLM-OCR has num_hidden_layers + num_nextn_predict_layers actual layers (the NextN/MTP layers come last)
+ self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
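+        # e.g. GLM-OCR ships a single NextN layer: 16 + 1 = 17, matching the
+        # n_layer == 17 (LLM_TYPE_1B) case in llama-model.cpp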
+ self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ # NextN/MTP prediction layers
+ if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+ self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
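+            # read back on the C++ side via LLM_KV_NEXTN_PREDICT_LAYERS in llama-model.cpp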
+
+
@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
class Glm4MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4_MOE
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_POST_NORM,
+ # NextN/MTP tensors - preserved but unused
+ MODEL_TENSOR.NEXTN_EH_PROJ,
+ MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+ MODEL_TENSOR.NEXTN_ENORM,
+ MODEL_TENSOR.NEXTN_HNORM,
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.GLM4_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
+ "visual.blocks.{bid}.attn.q_norm", # GLM-OCR
),
MODEL_TENSOR.V_ENC_ATTN_K: (
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
+ "visual.blocks.{bid}.attn.k_norm", # GLM-OCR
),
MODEL_TENSOR.V_ENC_ATTN_V: (
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
case LLM_ARCH_GLM4_MOE:
return {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+ // NextN/MTP parameters (GLM-OCR)
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+    // TODO: revisit this once MTP is implemented
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
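+    // (the NextN layers never run in the forward pass, so they get no KV cache slots)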
+
switch (hparams.n_layer) {
+ case 17: type = LLM_TYPE_1B; break; // GLM-OCR
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
}
for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
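+            // (TENSOR_SKIP makes the loader accept these tensors without loading their data)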
+
auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
if (layer.wqkv == nullptr) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
}
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
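+                // note: the output projection input is n_embd_head_k * n_head, which can
+                // differ from n_embd when the checkpoint overrides head_dim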
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, flags);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
+
+                    // NextN/MTP tensors (preserved but unused): only loaded for the last nextn_predict_layers layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
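+                        // (may be tied to the main token embedding / output head and absent from the file)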
+ }
}
} break;
case LLM_ARCH_GLM4_MOE:
ggml_tensor * inp_out_ids = build_inp_out_ids();
- for (int il = 0; il < n_layer; ++il) {
+        // Only run the regular transformer layers and skip the trailing NextN/MTP
+        // layers, whose tensors are loaded but never used in the forward pass
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
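+        // e.g. GLM-OCR: n_layer = 17, nextn_predict_layers = 1 -> layers 0..15 run, layer 16 is skipped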
+ for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// Pre-attention norm
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1 && inp_out_ids) {
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "post_mlp_norm", il);
}
- // Add residual connection after post-MLP norm
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
// Final norm
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
- // TODO: q/k norm requires row size == n_embd, while here it's d_head
- // we can add support in the future if needed
- GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
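+            // q/k norm is applied per attention head here (row size == d_head, not n_embd),
+            // e.g. by the GLM-OCR vision encoder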
+ if (layer.q_norm) {
+ GGML_ASSERT(layer.q_norm->ne[0] == Qcur->ne[0]);
+ Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+ cb(Qcur, "Qcur_norm", il);
+ }
+
+ if (layer.k_norm) {
+ GGML_ASSERT(layer.k_norm->ne[0] == Kcur->ne[0]);
+ Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+ cb(Kcur, "Kcur_norm", il);
+ }
} else {
// separate q, k, v
ggml_cgraph * clip_graph_glm4v::build() {
GGML_ASSERT(model.patch_bias != nullptr);
- GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
// pos-conv norm
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
- // calculate absolute position embedding and apply
- ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
- learned_pos_embd = ggml_cont_4d(
- ctx0, learned_pos_embd,
- n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
- learned_pos_embd = ggml_reshape_4d(
- ctx0, learned_pos_embd,
- n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
- learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
- learned_pos_embd = ggml_cont_3d(
- ctx0, learned_pos_embd,
- n_embd, n_patches_x * n_patches_y, batch_size);
- cb(learned_pos_embd, "learned_pos_embd", -1);
+ ggml_tensor * learned_pos_embd = nullptr;
+ // Note: GLM-OCR does not have learned position embeddings
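+    // in that case positional information comes entirely from the rotary embedding applied in add_pos below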
+ if (model.position_embeddings != nullptr) {
+ learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
+ learned_pos_embd = ggml_cont_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ learned_pos_embd = ggml_reshape_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+ learned_pos_embd = ggml_cont_3d(
+ ctx0, learned_pos_embd,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ cb(learned_pos_embd, "learned_pos_embd", -1);
+ }
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return ggml_rope_multi(