From: Junwon Hwang <redacted>
Date: Wed, 14 Jan 2026 18:38:21 +0000 (+0900)
Subject: model : clean up and fix EXAONE-MoE configuration (#18840)
X-Git-Tag: upstream/0.0.8067~330
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=8fb717557638f819e668e87f6d7dc0f39eb09c68;p=pkg%2Fggml%2Fsources%2Fllama.cpp

model : clean up and fix EXAONE-MoE configuration (#18840)

* Fix mismatch of EXAONE-MoE configuration

* ensure gating func is set, cleanup

---------

Co-authored-by: Sigbjørn Skjæret <redacted>
---

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index be83e3108..464ecbaab 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8772,11 +8772,7 @@ class ExaoneMoEModel(Exaone4Model):
         self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
         n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
         self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
-        # For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE,
-        # so that we can convert MTP weights to GGUF format for speculative decoding.
-        # This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now.
-        # Will be updated when HF config is updated.
-        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1))
+        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
 
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 75f969180..eaedc66b6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
                 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa, true);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_COUNT,                      hparams.n_expert);
-                ml.get_key(LLM_KV_EXPERT_USED_COUNT,                 hparams.n_expert_used);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,                hparams.n_expert_groups, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT,           hparams.n_group_used, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);