return cur;
}
-// TODO remove redundant scale_w argument
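+// note: scaling is now controlled by w_scale alone; 0.0f (the unset default) and 1.0f (identity) both mean "no scaling"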
ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * cur,
ggml_tensor * gate_inp,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
n_expert_used,
type_op,
norm_w,
- scale_w,
w_scale,
gating_op,
il,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
}
- if (scale_w) {
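+ // 0.0f (the unset hparam default) and 1.0f (identity) both mean "no scaling", replacing the old scale_w flag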
+ if (w_scale != 0.0f && w_scale != 1.0f) {
weights = ggml_scale(ctx0, weights, w_scale);
cb(weights, "ffn_moe_weights_scaled", il);
}
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
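+ // optional key: hparams.expert_weights_scale keeps its 0.0f default (no scaling) when absent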
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
switch (hparams.n_ff_exp) {
case 1408: type = LLM_TYPE_16B; break;
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
switch (hparams.n_layer) {
n_expert, n_expert_used,
LLM_FFN_SILU,
hparams.expert_weights_norm, // norm_w (route_norm=True)
- hparams.expert_weights_scale, // scale_w
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
#include "models.h"
-
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
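+ // arches that never read LLM_KV_EXPERT_WEIGHTS_SCALE keep the 0.0f default, so scaling stays disabled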
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
// feed-forward network
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
- model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
- LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ hparams.n_expert, hparams.n_expert_used,
+ LLM_FFN_GELU, false,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
model.arch == LLM_ARCH_JINA_BERT_V3) {
#include "models.h"
-
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il,
nullptr,
#include "models.h"
-
-
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_granite::llm_build_granite(
const llama_model & model,
const llm_graph_params & params)
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
nullptr,
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
n_expert, n_expert_used,
LLM_FFN_SILU,
true, // norm_topk_prob
- false,
- 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur_moe, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-#include "ggml.h"
#include "llama-memory-recurrent.h"
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
};
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
return build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
};
auto build_attn_block = [&model, this](ggml_tensor * cur,
ggml_tensor * inp_pos,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
-
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
cb(cur, "ffn_out", il);
} else {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
- 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
cb(cur, "ffn_moe_out", il);
}
-
#include "models.h"
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
// MoE branch
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_RELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
il, probs);
cb(cur, "ffn_out", il);
} else {
// MoE routed experts
- const bool norm_w = hparams.expert_weights_norm;
- const float w_scale = hparams.expert_weights_scale;
- const bool scale_w = w_scale != 0.0f;
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
- LLM_FFN_SILU,
- norm_w, scale_w, w_scale,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);