const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
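+ // inp_out_ids selects the token rows for which outputs are computed; it is built
+ // once per graph and may be null, in which case the last-layer gather is skipped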
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cb(cur, "attn_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cb(cur, "attn_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
const int64_t n_head_kv = hparams.n_head_kv(il);
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * attn_norm;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
inpL = ggml_add(ctx0, inpL, pos);
cb(inpL, "inpL", -1);
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_no_cache();
- // iterate layers
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * cur = inpL;
- ggml_tensor * Qcur;
- ggml_tensor * Kcur;
- ggml_tensor * Vcur;
+ {
+ ggml_tensor * Qcur;
+ ggml_tensor * Kcur;
+ ggml_tensor * Vcur;
- // self-attention
- if (model.layers[il].wqkv) {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
+ // self-attention
+ if (model.layers[il].wqkv) {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
- if (model.layers[il].bqkv) {
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- } else {
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
- }
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ } else {
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+ }
- if (model.layers[il].attn_q_norm) {
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
- }
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm,
+ model.layers[il].attn_q_norm_b,
+ LLM_NORM, il);
+ }
- if (model.layers[il].attn_k_norm) {
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
- }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm,
+ model.layers[il].attn_k_norm_b,
+ LLM_NORM, il);
+ }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // RoPE
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ // RoPE
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cb(cur, "kqv_out", il);
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+ }
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_no_cache();
- // iterate layers
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * cur = inpL;
- ggml_tensor * Qcur;
- ggml_tensor * Kcur;
- ggml_tensor * Vcur;
-
// pre-norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
- // self-attention
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- // RoPE
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ {
+ ggml_tensor * Qcur;
+ ggml_tensor * Kcur;
+ ggml_tensor * Vcur;
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ // self-attention
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // RoPE
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cb(cur, "kqv_out", il);
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+ }
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
cb(inpL, "inpL", -1);
}
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * attn_norm;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
// norm
cur = build_norm(inpL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
attn_norm_output = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
inp_attn = build_attn_inp_kv_unified();
}
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
auto * residual = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor* inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
- ggml_tensor * attention_norm = cur;
+ ggml_tensor * sa_inp = cur;
// self-attention
{
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- ggml_tensor * sa_out = cur;
-
- cur = attention_norm;
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
+ sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
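+ // sa_out keeps the attention output; the FFN below runs on the normalized
+ // pre-attention input (sa_inp)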
+ ggml_tensor * sa_out = cur;
+
+ cur = sa_inp;
+
// feed-forward network
{
cur = build_ffn(cur,
inpL = ggml_add(ctx0, inpL, pos);
cb(inpL, "inpL", -1);
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
struct llm_build_orion : public llm_graph_context {
llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
+ inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- // if (model.layers[il].bq) {
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- // cb(Qcur, "Qcur", il);
- // }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- // if (model.layers[il].bk) {
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- // cb(Kcur, "Kcur", il);
- // }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- // if (model.layers[il].bv) {
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- // cb(Vcur, "Vcur", il);
- // }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ // if (model.layers[il].bq) {
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ // cb(Qcur, "Qcur", il);
+ // }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ // if (model.layers[il].bk) {
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ // cb(Kcur, "Kcur", il);
+ // }
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ // if (model.layers[il].bv) {
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ // cb(Vcur, "Vcur", il);
+ // }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
- cur = ggml_add(ctx0, cur, ffn_inp);
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
- // input for next layer
- inpL = cur;
- }
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
- cur = inpL;
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
- cur = build_norm(cur,
- model.output_norm, model.output_norm_b,
- LLM_NORM, -1);
+ cur = build_norm(cur,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
+ ggml_build_forward_expand(gf, cur);
}
};
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
- const float scale_res = scale_depth/sqrtf(float(n_layer));
+ const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", il);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
// norm
cur = build_norm(inpL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
// norm
cur = build_norm(inpL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
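+ // gather the output rows before applying the post-attention norm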
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
cur = build_norm(cur,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
cb(sa_out, "sa_out", il);
// TODO: is causal == true correct? might need some changes
auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
cur = build_norm(cur,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
cb(sa_out, "sa_out", il);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * rs_inp = build_rs_inp();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
// norm
cur = build_norm(inpL,
cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
+
ggml_tensor * ffn_inp = cur;
// self-attention
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il);
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
cur = build_norm(cur,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const int64_t n_head = hparams.n_head(il);
const int64_t n_head_kv = hparams.n_head_kv(il);
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
}
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cb(cur, "attn_o_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_no_cache();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cb(cur, "kqv_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn_self = build_attn_inp_kv_unified();
auto * inp_attn_cross = build_attn_inp_cross();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
//cb(cur, "kqv_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
model.layers[il].attn_norm,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
const auto n_seq_tokens = ubatch.n_seq_tokens;
const auto n_seqs = ubatch.n_seqs;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
);
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
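+ // flatten from [n_embd, n_seq_tokens, n_seqs] back to [n_embd, n_tokens] so the
+ // last-layer gather operates on per-token rows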
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
}
cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
const auto n_seq_tokens = ubatch.n_seq_tokens;
const auto n_seqs = ubatch.n_seqs;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
}
// feed-forward network
const auto n_seq_tokens = ubatch.n_seq_tokens;
const auto n_seqs = ubatch.n_seqs;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
);
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
}
cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
const auto n_seq_tokens = ubatch.n_seq_tokens;
const auto n_seqs = ubatch.n_seqs;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
}
// feed-forward network
auto * inp_attn = build_attn_inp_kv_unified();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cb(cur, "attn_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-
- if (hparams.swin_norm) {
- cur = build_norm(cur,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- }
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
+ if (hparams.swin_norm) {
+ cur = build_norm(cur,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ }
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cb(cur, "attn_out", il);
}
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
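
For reference, a minimal sketch of the pattern every builder above now follows (identifiers as in the surrounding code; the per-layer attention/FFN bodies are elided):

    ggml_tensor * inp_out_ids = build_inp_out_ids(); // built once per graph, may be null

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // ... self-attention for layer il ...

        // on the last layer, keep only the rows that produce outputs
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        // ... feed-forward + residual for layer il ...

        inpL = cur;
    }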