ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w,
- ggml_tensor * cur) const {
+ ggml_tensor * cur,
+ ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
continue;
}
// lora delta: scale * (B @ (A @ cur)), accumulated onto the base matmul result
const float adapter_scale = lora.second;
const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
ggml_tensor * ab_cur = ggml_mul_mat(ctx0, lw->b, ggml_mul_mat(ctx0, lw->a, cur));
ab_cur = ggml_scale(ctx0, ab_cur, scale);
res = ggml_add(ctx0, res, ab_cur);
}
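+ // optionally apply the per-tensor weight scale (element-wise, broadcast by ggml_mul)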
+ if (w_s) {
+ res = ggml_mul(ctx0, res, w_s);
+ }
+
return res;
}
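The new third argument relies on `ggml_mul` broadcasting its second operand over the first, so `w_s` can hold one factor per output channel that gets applied to every token column. A construction-only sketch of that broadcast, outside the patch (the sizes, `main`, and context setup are illustrative, not from this change):

#include "ggml.h"

int main(void) {
    const int64_t n_embd = 4, n_tokens = 3; // toy sizes

    ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);

    // res: [n_embd, n_tokens] matmul output, w_s: [n_embd] per-channel scale
    ggml_tensor * res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    ggml_tensor * w_s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

    // records a node computing res[i, t] * w_s[i]; w_s is repeated along the
    // token dimension, matching the res = ggml_mul(ctx0, res, w_s) line above
    res = ggml_mul(ctx, res, w_s);

    ggml_free(ctx);
    return 0;
}

No compute is scheduled in this sketch; evaluating the node would require building and running a ggml cgraph, which the surrounding graph context handles in the real code path.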
ggml_tensor * cur,
int il) const;
- // do mat_mul, while optionally apply lora
+ // do mat_mul, optionally applying lora and a per-tensor scale
ggml_tensor * build_lora_mm(
ggml_tensor * w,
- ggml_tensor * cur) const;
+ ggml_tensor * cur,
+ ggml_tensor * w_s = nullptr) const;
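Since `w_s` defaults to `nullptr`, existing two-argument `build_lora_mm(w, cur)` call sites keep compiling unchanged; only the call sites that actually carry a per-tensor scale (the hunks below) pass the third argument.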
// do mat_mul_id, while optionally apply lora
ggml_tensor * build_lora_mm_id(
// self-attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_s) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
- }
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
}
// B1.K
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_s) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
}
// B1.V
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_s) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
LLM_NORM_RMS, il);
cb(cur, "attn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
- if (model.layers[il].wo_s) {
- cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
- }
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
LLM_NORM_RMS, il);
cb(cur, "ffn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].ffn_down, cur);
- if (model.layers[il].ffn_down_s) {
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_s);
- }
+ cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
cb(cur, "ffn_down", il);
cur = ggml_add(ctx0, cur, ffn_inp);
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_s) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
- }
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_s) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_s) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
// self-attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_s) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
- }
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_s) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_s) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
// self_attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_s) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
- }
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_s) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_s) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);