}
// recurrent / linear-attention weight scales (per-tensor, shape {1})
+ if (!layer.ssm_in_s && layer.ssm_in) {
+ layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
if (!layer.ssm_out_s && layer.ssm_out) {
layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
struct ggml_tensor * ffn_up_shexp_s = nullptr;
struct ggml_tensor * ffn_down_shexp_s = nullptr;
- struct ggml_tensor * ssm_out_s = nullptr;
+ struct ggml_tensor * ssm_in_s = nullptr;
+ struct ggml_tensor * ssm_out_s = nullptr;
struct ggml_tensor * ssm_alpha_s = nullptr;
struct ggml_tensor * ssm_beta_s = nullptr;
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
// {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+ ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur, layer.ssm_in_s);
// split the above in two
// => {d_inner, n_seq_tokens, n_seqs}
ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(layer.ssm_out, y);
+ cur = build_lora_mm(layer.ssm_out, y, layer.ssm_out_s);
}
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
// d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
// {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
- ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur, model.layers[il].ssm_in_s);
// split the above in three
ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(model.layers[il].ssm_out, y);
+ cur = build_lora_mm(model.layers[il].ssm_out, y, model.layers[il].ssm_out_s);
}
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il,
- router_logits);
+ router_logits, nullptr,
+ model.layers[il].ffn_up_exps_s,
+ nullptr, // no gate
+ model.layers[il].ffn_down_exps_s);
cb(moe_out, "ffn_moe_out", il);
if (model.layers[il].ffn_latent_up) {
}
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- NULL /* no gate */ , NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
+ model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
+ NULL /* no gate */ , NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);