yield from super().modify_tensors(data_torch, name, bid)
-@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
+@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
class Glm4VVisionModel(Qwen3VLVisionModel):
def set_gguf_parameters(self):
MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams["num_key_value_heads"]
n_embd = self.hparams["hidden_size"]
- head_dim = n_embd // n_head
+ head_dim = self.hparams.get("head_dim", n_embd // n_head)
# because the llama.cpp M-RoPE kernel only supports NeoX ordering, we have to permute the weights here
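# ("normal" ordering rotates adjacent pairs (d0, d1), (d2, d3), ... within each head,
#  while NeoX pairs the first and second halves of the rotated prefix; normal_to_neox
#  reorders the first head_dim * partial_rotary_factor rows of each head so that the
#  NeoX kernel reproduces the original rotation)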
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
yield from super().modify_tensors(data_torch, name, bid)
+@ModelBase.register("GlmOcrForConditionalGeneration")
+class GlmOCRModel(Glm4Model):
+ model_arch = gguf.MODEL_ARCH.GLM4
+ use_mrope = False
+ partial_rotary_factor = 0.5
+
+ # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+        # GLM-OCR has num_hidden_layers + num_nextn_predict_layers actual layers (the NextN/MTP layers come last)
+ self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
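+        # e.g. GLM-OCR ships a single NextN layer: 16 + 1 = 17, matching the
+        # n_layer == 17 (LLM_TYPE_1B) case in llama-model.cpp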
+ self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ # NextN/MTP prediction layers
+ if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+ self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
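+            # read back on the C++ side via LLM_KV_NEXTN_PREDICT_LAYERS in llama-model.cpp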
+
+
@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
class Glm4MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4_MOE
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_POST_NORM,
+ # NextN/MTP tensors - preserved but unused
+ MODEL_TENSOR.NEXTN_EH_PROJ,
+ MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+ MODEL_TENSOR.NEXTN_ENORM,
+ MODEL_TENSOR.NEXTN_HNORM,
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.GLM4_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
+ "visual.blocks.{bid}.attn.q_norm", # GLM-OCR
),
MODEL_TENSOR.V_ENC_ATTN_K: (
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
+ "visual.blocks.{bid}.attn.k_norm", # GLM-OCR
),
MODEL_TENSOR.V_ENC_ATTN_V: (
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
case LLM_ARCH_GLM4_MOE:
return {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+ // NextN/MTP parameters (GLM-OCR)
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+    // TODO: revisit this once MTP is implemented
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
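+    // (the NextN layers never run in the forward pass, so they get no KV cache slots)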
+
switch (hparams.n_layer) {
+ case 17: type = LLM_TYPE_1B; break; // GLM-OCR
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
}
for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
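+            // (TENSOR_SKIP makes the loader accept these tensors without loading their data)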
+
auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
if (layer.wqkv == nullptr) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
}
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
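+                // note: the output projection input is n_embd_head_k * n_head, which can
+                // differ from n_embd when the checkpoint overrides head_dim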
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, flags);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
+
+                    // NextN/MTP tensors (preserved but unused): only loaded for the last nextn_predict_layers layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
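+                        // (may be tied to the main token embedding / output head and absent from the file)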
+ }
}
} break;
case LLM_ARCH_GLM4_MOE:
ggml_tensor * inp_out_ids = build_inp_out_ids();
- for (int il = 0; il < n_layer; ++il) {
+        // Only run the regular transformer layers and skip the trailing NextN/MTP
+        // layers, whose tensors are loaded but never used in the forward pass
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
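+        // e.g. GLM-OCR: n_layer = 17, nextn_predict_layers = 1 -> layers 0..15 run, layer 16 is skipped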
+ for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// Pre-attention norm
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1 && inp_out_ids) {
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "post_mlp_norm", il);
}
- // Add residual connection after post-MLP norm
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
// Final norm
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
- // TODO: q/k norm requires row size == n_embd, while here it's d_head
- // we can add support in the future if needed
- GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
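+            // q/k norm is applied per attention head here (row size == d_head, not n_embd),
+            // e.g. by the GLM-OCR vision encoder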
+ if (layer.q_norm) {
+ GGML_ASSERT(layer.q_norm->ne[0] == Qcur->ne[0]);
+ Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+ cb(Qcur, "Qcur_norm", il);
+ }
+
+ if (layer.k_norm) {
+ GGML_ASSERT(layer.k_norm->ne[0] == Kcur->ne[0]);
+ Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+ cb(Kcur, "Kcur_norm", il);
+ }
} else {
// separate q, k, v
ggml_cgraph * clip_graph_glm4v::build() {
GGML_ASSERT(model.patch_bias != nullptr);
- GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
// pos-conv norm
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
- // calculate absolute position embedding and apply
- ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
- learned_pos_embd = ggml_cont_4d(
- ctx0, learned_pos_embd,
- n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
- learned_pos_embd = ggml_reshape_4d(
- ctx0, learned_pos_embd,
- n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
- learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
- learned_pos_embd = ggml_cont_3d(
- ctx0, learned_pos_embd,
- n_embd, n_patches_x * n_patches_y, batch_size);
- cb(learned_pos_embd, "learned_pos_embd", -1);
+ ggml_tensor * learned_pos_embd = nullptr;
+ // Note: GLM-OCR does not have learned position embeddings
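+    // in that case positional information comes entirely from the rotary embedding applied in add_pos below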
+ if (model.position_embeddings != nullptr) {
+ learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
+ learned_pos_embd = ggml_cont_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+ learned_pos_embd = ggml_reshape_4d(
+ ctx0, learned_pos_embd,
+ n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+ learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+ learned_pos_embd = ggml_cont_3d(
+ ctx0, learned_pos_embd,
+ n_embd, n_patches_x * n_patches_y, batch_size);
+ cb(learned_pos_embd, "learned_pos_embd", -1);
+ }
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return ggml_rope_multi(