From: Sigbjørn Skjæret Date: Sun, 8 Mar 2026 17:58:28 +0000 (+0100) Subject: graph : remove redundant scale_w parameter (#20235) X-Git-Tag: upstream/0.0.8611~367 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=35bee031e17ed2b2e8e7278b284a6c8cd120d9f8;p=pkg%2Fggml%2Fsources%2Fllama.cpp graph : remove redundant scale_w parameter (#20235) --- diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 99bd6796b..f4cb7dce1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1151,7 +1151,6 @@ ggml_tensor * llm_graph_context::build_ffn( return cur; } -// TODO remove redundant scale_w argument ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * cur, ggml_tensor * gate_inp, @@ -1163,7 +1162,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( int64_t n_expert_used, llm_ffn_op_type type_op, bool norm_w, - bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, int il, @@ -1180,7 +1178,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( n_expert_used, type_op, norm_w, - scale_w, w_scale, gating_op, il, @@ -1204,7 +1201,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( int64_t n_expert_used, llm_ffn_op_type type_op, bool norm_w, - bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, int il, @@ -1332,7 +1328,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); } - if (scale_w) { + if (w_scale != 0.0f && w_scale != 1.0f) { weights = ggml_scale(ctx0, weights, w_scale); cb(weights, "ffn_moe_weights_scaled", il); } diff --git a/src/llama-graph.h b/src/llama-graph.h index e8f006977..7f6c9e963 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -810,7 +810,6 @@ struct llm_graph_context { int64_t n_expert_used, llm_ffn_op_type type_op, bool norm_w, - bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, int il, @@ -832,7 +831,6 @@ struct llm_graph_context { int64_t n_expert_used, llm_ffn_op_type type_op, bool norm_w, - bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, int il, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ef9c2dfc5..e18cca052 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1570,6 +1570,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); switch (hparams.n_ff_exp) { case 1408: type = LLM_TYPE_16B; break; @@ -2076,6 +2077,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); switch (hparams.n_layer) { diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 6a752a403..11f5ea2c2 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, // norm_w (route_norm=True) - hparams.expert_weights_scale, // scale_w hparams.expert_weights_scale, // w_scale (route_scale=2.826) (llama_expert_gating_func_type) hparams.expert_gating_func, il); diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index e8f028a72..d7db06de1 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -1,6 +1,5 @@ #include "models.h" - llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index ed56b9c47..25e3369c3 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -1,6 +1,5 @@ #include "models.h" - llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_ nullptr, n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, - false, hparams.expert_weights_scale, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index a72a5a7ca..81906cecb 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/bert.cpp b/src/models/bert.cpp index bca0e254f..17efdafec 100644 --- a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params // feed-forward network if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { // MoE branch - cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr, - model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used, - LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, hparams.n_expert_used, + LLM_FFN_GELU, false, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); cb(cur, "ffn_moe_out", il); } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index 6d2a0ebf1..5c7f10843 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -1,6 +1,5 @@ #include "models.h" - llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/deepseek.cpp b/src/models/deepseek.cpp index 17866c0d8..57cb1724f 100644 --- a/src/models/deepseek.cpp +++ b/src/models/deepseek.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -100,7 +98,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, - false, hparams.expert_weights_scale, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index be81709c5..abd54b765 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -216,7 +216,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il, nullptr, diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index bcbd9af50..0bcf3fe0d 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -91,7 +89,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/ernie4-5-moe.cpp b/src/models/ernie4-5-moe.cpp index 0d96d14e6..ed781d507 100644 --- a/src/models/ernie4-5-moe.cpp +++ b/src/models/ernie4-5-moe.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -103,7 +101,7 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index efc31d694..a7396829c 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -1,6 +1,5 @@ #include "models.h" - llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k; @@ -100,7 +99,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_ model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index d51cf0741..97a65f411 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -128,7 +128,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(routed_out, "ffn_moe_out", il); diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 726ecdcca..d9b1c06da 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -1,6 +1,5 @@ #include "models.h" - llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -160,7 +159,7 @@ ggml_tensor * llm_build_granite_hybrid::build_layer_ffn(ggml_tensor * cur, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/granite.cpp b/src/models/granite.cpp index 18748e9c2..fd97116ed 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -1,6 +1,5 @@ #include "models.h" - llm_build_granite::llm_build_granite( const llama_model & model, const llm_graph_params & params) @@ -175,7 +174,7 @@ ggml_tensor * llm_build_granite::build_layer_ffn( nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 3c54dfee6..24232604b 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -99,7 +99,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params nullptr, n_expert, n_expert_used, LLM_FFN_GELU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index 56b6db9a3..2081f0685 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -90,7 +88,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, probs); @@ -106,7 +104,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap nullptr, n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, probs); diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index 77e39de5b..cee2b541b 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -119,8 +119,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll n_expert, n_expert_used, LLM_FFN_SILU, true, // norm_topk_prob - false, - 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur_moe, "ffn_moe_out", il); diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index ceab58174..1d482e425 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -76,7 +76,7 @@ llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_para nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index d178ca8b7..8d4b95e51 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,5 +1,4 @@ #include "models.h" -#include "ggml.h" #include "llama-memory-recurrent.h" @@ -341,7 +340,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll hparams.n_expert, hparams.n_expert_used, LLM_FFN_SILU, true, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index cf01ad625..8ca8e6c8e 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -23,10 +23,16 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_ }; auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * { return build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0, - static_cast(hparams.expert_gating_func), il); + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + static_cast(hparams.expert_gating_func), + il); }; auto build_attn_block = [&model, this](ggml_tensor * cur, ggml_tensor * inp_pos, diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index 5f64686f5..3bb9943f4 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -90,7 +90,7 @@ llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_gr nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/llama-iswa.cpp b/src/models/llama-iswa.cpp index 61dd2c179..40dc2427a 100644 --- a/src/models/llama-iswa.cpp +++ b/src/models/llama-iswa.cpp @@ -134,7 +134,7 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_ nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il); diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 42b5fcdf4..279f2e301 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -130,7 +130,7 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_gra nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/mimo2-iswa.cpp b/src/models/mimo2-iswa.cpp index edc87cc9f..06956915e 100644 --- a/src/models/mimo2-iswa.cpp +++ b/src/models/mimo2-iswa.cpp @@ -1,4 +1,3 @@ - #include "models.h" llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { @@ -88,10 +87,17 @@ llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_ cb(cur, "ffn_out", il); } else { // MoE branch - cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, - 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il); + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, + il); cb(cur, "ffn_moe_out", il); } diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index f7001badf..fbeed8eab 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -1,4 +1,3 @@ - #include "models.h" llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { @@ -91,7 +90,7 @@ llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_ model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 0b6722359..49734989d 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -127,7 +127,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index 347f28948..fa2b55a28 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -124,7 +124,7 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_RELU_SQR, hparams.expert_weights_norm, - hparams.expert_weights_scale, hparams.expert_weights_scale, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index b8b6988f8..a05b892c7 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -92,7 +92,7 @@ llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_para nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp index dbe3ca185..403f130bc 100644 --- a/src/models/openai-moe-iswa.cpp +++ b/src/models/openai-moe-iswa.cpp @@ -95,7 +95,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, nullptr, n_expert, n_expert_used, LLM_FFN_SWIGLU_OAI_MOE, false, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index c8e5da33d..803e374aa 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -114,7 +114,7 @@ llm_build_phi3::llm_build_phi3(const llama_model & model, const llm_graph_ nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 49142b712..e19061334 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -94,7 +94,7 @@ llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_grap nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 17291ec23..fe081af21 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -375,11 +375,15 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int ggml_tensor * moe_out = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, nullptr, - n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 888534fb3..9cf1ec03c 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -91,7 +91,7 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index f2621200f..9b8164ddf 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -475,11 +475,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int // MoE branch ggml_tensor * moe_out = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, nullptr, - n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp index e5e1a2150..f347c5d6f 100644 --- a/src/models/qwen3vl-moe.cpp +++ b/src/models/qwen3vl-moe.cpp @@ -99,7 +99,7 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_ nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index 46b3dc3ef..de9ab1c65 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -93,7 +93,7 @@ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, - false, 0.0, + hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(moe_out, "ffn_moe_out", il); diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 4c497ca76..8723905e8 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -93,7 +93,7 @@ llm_build_smallthinker::llm_build_smallthinker(const llama_model & model, nullptr, n_expert, n_expert_used, LLM_FFN_RELU, true, - false, 0.0, + hparams.expert_weights_scale, static_cast(hparams.expert_gating_func), il, probs); diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index f8737815a..aa8e98b73 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -119,9 +119,6 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll cb(cur, "ffn_out", il); } else { // MoE routed experts - const bool norm_w = hparams.expert_weights_norm; - const float w_scale = hparams.expert_weights_scale; - const bool scale_w = w_scale != 0.0f; ggml_tensor * moe_out = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, @@ -129,8 +126,8 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll model.layers[il].ffn_down_exps, model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, - LLM_FFN_SILU, - norm_w, scale_w, w_scale, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il);