const llama_model & model,
const llama_hparams & hparams,
const llama_kv_cache & kv,
+ struct ggml_cgraph * graph,
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur,
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
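+ // add the attention result (and all of its dependencies) to the graph before the
+ // output projection below, so that these nodes are not reordered past it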
+ ggml_build_forward_expand(graph, cur);
+
cur = ggml_mul_mat(ctx, wo, cur);
if (wo_b) {
    cb(cur, "kqv_wo", il);
    cur = ggml_add(ctx, cur, wo_b);
}

return cur;
}
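+
+// store the current K/V values into the KV cache and compute the attention output
+// in a single call (llm_build_kv_store followed by llm_build_kqv), so that the
+// per-model graph builders below only need one call per layer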
+static struct ggml_tensor * llm_build_kv(
+ struct ggml_context * ctx,
+ const llama_model & model,
+ const llama_hparams & hparams,
+ const llama_kv_cache & kv,
+ struct ggml_cgraph * graph,
+ struct ggml_tensor * wo,
+ struct ggml_tensor * wo_b,
+ struct ggml_tensor * k_cur,
+ struct ggml_tensor * v_cur,
+ struct ggml_tensor * q_cur,
+ struct ggml_tensor * kq_mask,
+ int64_t n_ctx,
+ int32_t n_tokens,
+ int32_t kv_head,
+ int32_t n_kv,
+ float max_alibi_bias,
+ float kq_scale,
+ const llm_build_cb & cb,
+ int il) {
+
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(graph, k_cur);
+ ggml_build_forward_expand(graph, v_cur);
+ ggml_build_forward_expand(graph, q_cur);
+
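+ // write the new K and V values into the cache at position kv_head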
+ llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
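+ // compute the attention output over the updated cache and apply the output projection (wo, wo_b)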
+ struct ggml_tensor * cur;
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph,
+ wo, wo_b,
+ q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+ cb(cur, "kqv_out", il);
+
+ return cur;
+}
+
struct llm_build_context {
const llama_model & model;
const llama_hparams & hparams;
cb(Vcur, "Vcur", il);
}
- // these nodes are added to the graph together so that they are not reordered
- // by doing so, the number of splits in the graph is reduced
- ggml_build_forward_expand(gf, Qcur);
- ggml_build_forward_expand(gf, Kcur);
- ggml_build_forward_expand(gf, Vcur);
-
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
// apply ALiBi for 13B model
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Vcur, "Vcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- // TODO: not tested, could be broken
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
ggml_set_name(cur, name);
}
+
+ if (!lctx.cparams.offload_kqv) {
+ if (strcmp(name, "kqv_merged_cont") == 0) {
+ // all nodes between the KV store and the attention output are run on the CPU
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
+ }
+ }
+
//
// allocate input tensors and set input data
//