ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
}
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
{
const bool is_lite = (hparams.n_layer == 27);
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
if (!is_lite) {
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
} else {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
}
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+ // note: only old legacy GGUF files will have the unsplit wkv_b tensor; otherwise the split wk_b and wv_b tensors are loaded
+ if (is_mla) {
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ } else {
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
ggml_tensor * sa_out = cur;
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, kq_scale, il);
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
cur = build_norm(cur,
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
}
cur = build_norm(cur,
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
cur = build_norm(cur,
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1) {
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
bool is_lite = (hparams.n_layer == 27);
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
-
ggml_tensor * cur;
ggml_tensor * inpL;
{
ggml_tensor * q = NULL;
if (!is_lite) {
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);
q = build_norm(q,
- model.layers[il].attn_q_a_norm, NULL,
+ model.layers[il].attn_q_a_norm, nullptr,
LLM_NORM_RMS, il);
cb(q, "q", il);
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
} else {
cb(q, "q", il);
}
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
+ n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(q->type, n_embd_head_k),
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
0);
cb(q_nope, "q_nope", il);
- // and {n_head * n_embd_head_qk_rope, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
+ n_embd_head_qk_rope, n_head, n_tokens,
+ ggml_row_size(q->type, n_embd_head_k),
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
// split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
- kv_pe_compresseed->nb[1],
+ ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
+ kv_lora_rank, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
0);
- cb(kv_compressed, "kv_compressed", il);
+ cb(kv_cmpr, "kv_cmpr", il);
+
+ // and {n_embd_head_qk_rope, 1, n_tokens}
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
+ n_embd_head_qk_rope, 1, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
- // and {n_embd_head_qk_rope, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
- kv_pe_compresseed->nb[1],
- kv_pe_compresseed->nb[1],
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
+ );
cb(k_pe, "k_pe", il);
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
- kv_compressed = ggml_cont(ctx0, kv_compressed);
- kv_compressed = build_norm(kv_compressed,
- model.layers[il].attn_kv_a_norm, NULL,
+ kv_cmpr = build_norm(kv_cmpr,
+ model.layers[il].attn_kv_a_norm, nullptr,
LLM_NORM_RMS, il);
- cb(kv_compressed, "kv_compressed", il);
+ cb(kv_cmpr, "kv_cmpr", il);
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
- cb(kv, "kv", il);
+ if (is_mla) {
+ // {n_embd_head_qk_nope, n_tokens, n_head}
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+ cb(q_nope, "q_nope_perm", il);
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
- // and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
- cb(v_states, "v_states", il);
+ // {kv_lora_rank, n_head, n_tokens}
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
- v_states = ggml_cont(ctx0, v_states);
- cb(v_states, "v_states", il);
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+ cb(Qcur, "Qcur", il);
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
- 0);
- cb(v_states, "v_states", il);
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
- q_pe = ggml_rope_ext(
- ctx0, q_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+ cb(Kcur, "Kcur", il);
- // shared RoPE key
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
- k_pe = ggml_rope_ext(
- ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
+ // {kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Vcur = kv_cmpr;
+ cb(Vcur, "Vcur", il);
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
- cb(q_states, "q_states", il);
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+ } else {
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+ cb(kv, "kv", il);
+
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
+ n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ 0);
+ cb(k_nope, "k_nope_view", il);
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
- cb(k_states, "k_states", il);
+ // and {n_embd_head_v, n_head, n_tokens}
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
+ n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
+ cb(Vcur, "Vcur_view", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, kq_scale, il);
+ Vcur = ggml_cont(ctx0, Vcur);
+ cb(Vcur, "Vcur_cont", il);
+
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+ cb(Kcur, "Kcur", il);
+
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ }
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
NULL, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cur = build_norm(cur,
model.layers[il].attn_sub_norm, NULL,
cur = build_attn(inp_attn, gf,
model.layers[il].wo_enc, nullptr,
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}
cur = build_attn(inp_attn_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}
cur = build_attn(inp_attn_cross, gf,
model.layers[il].wo_cross, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
//ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (hparams.swin_norm) {
cur = build_norm(cur,
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, kq_scale, il);
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1) {
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}
if (il == n_layer - 1) {