return cur;
}
-// TODO remove redundant scale_w argument
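+// note: scaling is now controlled by w_scale alone; 0.0f (the unset default) and 1.0f (identity) both mean "no scaling"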
ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * cur,
ggml_tensor * gate_inp,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
n_expert_used,
type_op,
norm_w,
- scale_w,
w_scale,
gating_op,
il,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
}
- if (scale_w) {
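+ // 0.0f (the unset hparam default) and 1.0f (identity) both mean "no scaling", replacing the old scale_w flag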
+ if (w_scale != 0.0f && w_scale != 1.0f) {
weights = ggml_scale(ctx0, weights, w_scale);
cb(weights, "ffn_moe_weights_scaled", il);
}
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
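+ // optional key: hparams.expert_weights_scale keeps its 0.0f default (no scaling) when absent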
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
switch (hparams.n_ff_exp) {
case 1408: type = LLM_TYPE_16B; break;
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
switch (hparams.n_layer) {
n_expert, n_expert_used,
LLM_FFN_SILU,
hparams.expert_weights_norm, // norm_w (route_norm=True)
- hparams.expert_weights_scale, // scale_w
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
#include "models.h"
-
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
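+ // arches that never read LLM_KV_EXPERT_WEIGHTS_SCALE keep the 0.0f default, so scaling stays disabled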
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
// feed-forward network
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
- model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
- LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ hparams.n_expert, hparams.n_expert_used,
+ LLM_FFN_GELU, false,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
model.arch == LLM_ARCH_JINA_BERT_V3) {
#include "models.h"
-
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il,
nullptr,
#include "models.h"
-
-
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_granite::llm_build_granite(
const llama_model & model,
const llm_graph_params & params)
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
nullptr,
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
n_expert, n_expert_used,
LLM_FFN_SILU,
true, // norm_topk_prob
- false,
- 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur_moe, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-#include "ggml.h"
#include "llama-memory-recurrent.h"
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
};
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
return build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
};
auto build_attn_block = [&model, this](ggml_tensor * cur,
ggml_tensor * inp_pos,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
-
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
cb(cur, "ffn_out", il);
} else {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
- 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
cb(cur, "ffn_moe_out", il);
}
-
#include "models.h"
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
// MoE branch
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_RELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
il, probs);
cb(cur, "ffn_out", il);
} else {
// MoE routed experts
- const bool norm_w = hparams.expert_weights_norm;
- const float w_scale = hparams.expert_weights_scale;
- const bool scale_w = w_scale != 0.0f;
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
- LLM_FFN_SILU,
- norm_w, scale_w, w_scale,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);