git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
model : clean up and fix EXAONE-MoE configuration (#18840)
authorJunwon Hwang <redacted>
Wed, 14 Jan 2026 18:38:21 +0000 (03:38 +0900)
committerGitHub <redacted>
Wed, 14 Jan 2026 18:38:21 +0000 (19:38 +0100)
* Fix mismatch of EXAONE-MoE configuration

* ensure gating func is set, cleanup

---------

Co-authored-by: Sigbjørn Skjæret <redacted>
convert_hf_to_gguf.py
src/llama-model.cpp

index be83e3108e81b086659961a9b41279bb028d8ad2..464ecbaab915242c7400dae7210734d6923382c2 100755 (executable)
@@ -8772,11 +8772,7 @@ class ExaoneMoEModel(Exaone4Model):
         self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
         n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
         self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
-        # For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE,
-        # so that we can convert MTP weights to GGUF format for speculative decoding.
-        # This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now.
-        # Will be updated when HF config is updated.
-        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1))
+        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
 
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
index 75f9691807c5ccba4e352db61fef02aac1d125d8..eaedc66b63ea12a779c78d1573d55e33a5301b7c 100644 (file)
@@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
                 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa, true);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_COUNT,                      hparams.n_expert);
-                ml.get_key(LLM_KV_EXPERT_USED_COUNT,                 hparams.n_expert_used);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,                hparams.n_expert_groups, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT,           hparams.n_group_used, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);