From: Junwon Hwang Date: Wed, 14 Jan 2026 18:38:21 +0000 (+0900) Subject: model : clean up and fix EXAONE-MoE configuration (#18840) X-Git-Tag: upstream/0.0.8067~330 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=8fb717557638f819e668e87f6d7dc0f39eb09c68;p=pkg%2Fggml%2Fsources%2Fllama.cpp model : clean up and fix EXAONE-MoE configuration (#18840) * Fix mismatch of EXAONE-MoE configuration * ensure gating func is set, cleanup --------- Co-authored-by: Sigbjørn Skjæret --- diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index be83e3108..464ecbaab 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8772,11 +8772,7 @@ class ExaoneMoEModel(Exaone4Model): self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0)) self.gguf_writer.add_leading_dense_block_count(n_dense_layer) - # For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE, - # so that we can convert MTP weights to GGUF format for speculative decoding. - # This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now. - # Will be updated when HF config is updated. - self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1)) + self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0)) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 75f969180..eaedc66b6 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, true); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); - ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);