struct ggml_tensor * cur,
struct ggml_tensor * up,
struct ggml_tensor * up_b,
+ struct ggml_tensor * up_s,
struct ggml_tensor * gate,
struct ggml_tensor * gate_b,
+ struct ggml_tensor * gate_s,
struct ggml_tensor * down,
struct ggml_tensor * down_b,
+ struct ggml_tensor * down_s,
struct ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
cb(tmp, "ffn_up_b", il);
}
+ if (up_s) {
+ tmp = ggml_mul(ctx, tmp, up_s);
+ cb(tmp, "ffn_up_s", il);
+ }
+
if (gate) {
switch (type_gate) {
case LLM_FFN_SEQ:
cur = ggml_add(ctx, cur, gate_b);
cb(cur, "ffn_gate_b", il);
}
+
+ if (gate_s) {
+ cur = ggml_mul(ctx, cur, gate_s);
+ cb(cur, "ffn_gate_s", il);
+ }
+
} else {
cur = tmp;
}
cb(cur, "ffn_gate_par", il);
}
- cur = ggml_mul_mat(ctx, down, cur);
+ if (down) {
+ cur = ggml_mul_mat(ctx, down, cur);
+ }
+
if (down_b) {
cb(cur, "ffn_down", il);
}
cur = ggml_add(ctx, cur, down_b);
}
+ if (down_s) {
+ cur = ggml_mul(ctx, cur, down_s);
+ cb(cur, "ffn_down_s", il);
+ }
+
return cur;
}
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
// feed forward
{
cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
- model.layers[il].ffn_up, NULL,
- NULL, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
// feed-forward network
if (model.arch == LLM_ARCH_BERT) {
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
} else {
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
}
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
LLM_NORM, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
model.layers[il].ffn_act,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cur = inpSA;
}
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur_gate, "ffn_shexp_gate", il);
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up_shexp, NULL,
- model.layers[il].ffn_gate_shexp, NULL,
- model.layers[il].ffn_down_shexp, NULL,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur_ffn, "ffn_shexp", il);
// FF
{
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(ffn_output, "ffn_out", il);
// feed-forward network
{
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
// feed-forward network
{
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
// feed-forward network
{
cur = llm_build_ffn(ctx0, ffn_inp,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
// FFN shared expert
{
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up_shexp, NULL,
- model.layers[il].ffn_gate_shexp, NULL,
- model.layers[il].ffn_down_shexp, NULL,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(ffn_shexp, "ffn_shexp", il);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- nullptr, nullptr,
+ NULL, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cur = llm_build_norm(ctx0, cur, hparams,
cb(ffn_inp, "ffn_inp", il);
// feed-forward forward
- if (model.layers[il].ffn_gate_inp == nullptr) {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
-
- struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
- tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
- cb(tmp, "ffn_up", il);
-
- cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
- cb(cur, "ffn_gate", il);
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
- cur = ggml_silu(ctx0, cur);
- cb(cur, "ffn_silu", il);
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+ NULL, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_sub_out", il);
- cur = ggml_mul(ctx0, cur, tmp);
- cb(cur, "ffn_gate_par", il);
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_sub_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_sub_norm", il);
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].ffn_sub_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_sub_norm", il);
+ cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ cb(cur, "ffn_down", il);
- cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
- cb(cur, "ffn_down", il);
- }
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il);