ggml_type type_v,
bool v_trans,
bool offload,
+ bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type) :
model(model), hparams(model.hparams), v_trans(v_trans),
- n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
+ n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
GGML_ASSERT(kv_size % n_pad == 0);
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
- /*.mem_size =*/ size_t(2u*n_layer_cache*ggml_tensor_overhead()),
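+ // each cached layer allocates one K and one V tensor plus n_stream per-stream views of each,
+ // hence 2*(1 + n_stream) tensor overheads per layer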
+ /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
return it->second;
};
- head = 0;
+ GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
- cells.resize(kv_size);
+ v_heads.resize(n_stream);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ v_heads[s] = 0;
+ }
+
+ v_cells.resize(n_stream);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ v_cells[s].resize(kv_size);
+ }
+
+ // by default, all sequence ids are mapped to the 0th stream
+ seq_to_stream.resize(LLAMA_MAX_SEQ, 0);
+
+ if (n_stream > 1) {
+ seq_to_stream.resize(n_stream, 0);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ seq_to_stream[s] = s;
+ }
+ }
+
+ // [TAG_V_CACHE_VARIABLE]
+ if (v_trans && hparams.is_n_embd_v_gqa_variable()) {
+ LLAMA_LOG_WARN("%s: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to %d\n",
+ __func__, hparams.n_embd_v_gqa_max());
+ }
for (uint32_t il = 0; il < n_layer_cache; il++) {
if (filter && !filter(il)) {
continue;
}
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ // [TAG_V_CACHE_VARIABLE]
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max();
const char * dev_name = "CPU";
ggml_tensor * k;
ggml_tensor * v;
- k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
- v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
+ k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+ v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
ggml_format_name(k, "cache_k_l%d", il);
ggml_format_name(v, "cache_v_l%d", il);
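+ // per-stream 2D views into the K/V tensors - used for cross-stream buffer copies and for per-stream state save/load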
+ std::vector<ggml_tensor *> k_stream;
+ std::vector<ggml_tensor *> v_stream;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
+ v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
+ }
+
map_layer_ids[il] = layers.size();
- layers.push_back({ il, k, v });
+
+ layers.push_back({ il, k, v, k_stream, v_stream, });
}
// TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
const size_t memory_size_k = size_k_bytes();
const size_t memory_size_v = size_v_bytes();
- LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
+ if (!supports_set_rows) {
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+ GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
+ }
+
if (!supports_set_rows) {
LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
}
}
void llama_kv_cache_unified::clear(bool data) {
- cells.reset();
-
- head = 0;
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ v_cells[s].reset();
+ v_heads[s] = 0;
+ }
if (data) {
for (auto & buf : bufs) {
}
bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+ auto & head = v_heads[seq_to_stream[seq_id]];
+
uint32_t new_head = cells.size();
if (p0 < 0) {
}
void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
- if (seq_id_src == seq_id_dst) {
+ GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
+ GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
+
+ const auto s0 = seq_to_stream[seq_id_src];
+ const auto s1 = seq_to_stream[seq_id_dst];
+
+ if (s0 == s1) {
+ // since both sequences are in the same stream, no data copy is necessary
+ // we just have to update the cells' metadata
+
+ auto & cells = v_cells[s0];
+
+ if (seq_id_src == seq_id_dst) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id_src)) {
+ cells.seq_add(i, seq_id_dst);
+ }
+ }
+
return;
}
- if (p0 < 0) {
- p0 = 0;
+ // cross-stream sequence copies require copying the actual buffer data
+
+ bool is_full = true;
+
+ if (p0 > 0 && p0 + 1 < (int) get_size()) {
+ is_full = false;
}
- if (p1 < 0) {
- p1 = std::numeric_limits<llama_pos>::max();
+ if (p1 > 0 && p1 + 1 < (int) get_size()) {
+ is_full = false;
}
- for (uint32_t i = 0; i < cells.size(); ++i) {
- if (!cells.pos_in(i, p0, p1)) {
- continue;
- }
+ GGML_ASSERT(is_full && "seq_cp() is only supported for full KV buffers");
+
+ // enqueue the copy operation - the buffer copy will be performed during the next update
+ sc_info.ssrc.push_back(s0);
+ sc_info.sdst.push_back(s1);
+
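+ // mirror the source stream's cell metadata (positions and pending shifts) for this sequence into the destination stream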
+ v_cells[s1].reset();
+ for (uint32_t i = 0; i < v_cells[s0].size(); ++i) {
+ if (v_cells[s0].seq_has(i, seq_id_src)) {
+ llama_pos pos = v_cells[s0].pos_get(i);
+ llama_pos shift = v_cells[s0].get_shift(i);
+
+ if (shift != 0) {
+ pos -= shift;
+ assert(pos >= 0);
+ }
+
+ v_cells[s1].pos_set(i, pos);
+ v_cells[s1].seq_add(i, seq_id_dst);
- if (cells.seq_has(i, seq_id_src)) {
- cells.seq_add(i, seq_id_dst);
+ if (shift != 0) {
+ v_cells[s1].pos_add(i, shift);
+ }
}
}
+
+ v_heads[s1] = v_heads[s0];
+
+ //for (uint32_t s = 0; s < n_stream; ++s) {
+ // LLAMA_LOG_WARN("%s: seq %d: min = %d, max = %d\n", __func__, s, v_cells[s].seq_pos_min(s), v_cells[s].seq_pos_max(s));
+ //}
}
void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+ auto & head = v_heads[seq_to_stream[seq_id]];
+
uint32_t new_head = cells.size();
for (uint32_t i = 0; i < cells.size(); ++i) {
}
void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+ auto & head = v_heads[seq_to_stream[seq_id]];
+
if (shift == 0) {
return;
}
}
void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+
if (d == 1) {
return;
}
}
llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
+
return cells.seq_pos_min(seq_id);
}
llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
+
return cells.seq_pos_max(seq_id);
}
std::vector<llama_ubatch> ubatches;
while (true) {
- auto ubatch = balloc.split_simple(n_ubatch);
+ auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
if (ubatch.n_tokens == 0) {
break;
defrag_info dinfo;
// see if we need to defrag
- {
+ if (n_stream == 1) {
+ // note: for now do not consider defrag for n_stream > 1
+ const auto & cells = v_cells[seq_to_stream[0]];
+
bool do_defrag = optimize;
const auto thold = lctx->get_cparams().defrag_thold;
}
}
- return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo));
+ return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo), std::move(sc_info));
}
llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
llama_kv_cache_unified::slot_info_vec_t res;
- struct state {
- uint32_t head_old; // old position of the head, before placing the ubatch
-
+ struct state_t {
slot_info sinfo; // slot info for the ubatch
- llama_kv_cells_unified cells; // copy of the old cells, before placing the ubatch
+ std::vector<uint32_t> v_heads_old; // old positions of the heads, before placing the ubatch
+
+ std::vector<llama_kv_cells_unified> v_cells; // copy of the old cells, before placing the ubatch
};
// remember the old state of the cells so we can restore it in the end
- std::vector<state> states;
+ std::vector<state_t> states;
bool success = true;
res.push_back(sinfo_new);
// store the old state of the cells in the recovery stack
- states.push_back({head, sinfo_new, cells.cp(sinfo_new.idxs)});
+ {
+ state_t state = { sinfo_new, v_heads, {} };
+
+ for (uint32_t s = 0; s < sinfo_new.n_stream(); ++s) {
+ auto & cells = v_cells[sinfo_new.strm[s]];
+
+ state.v_cells.push_back(cells.cp(sinfo_new.idxs[s]));
+ }
+
+ states.push_back(std::move(state));
+ }
// now emplace the ubatch
apply_ubatch(sinfo_new, ubatch);
}
+ GGML_ASSERT(!states.empty() || !success);
+
// iterate backwards and restore the cells to their original state
for (auto it = states.rbegin(); it != states.rend(); ++it) {
- cells.set(it->sinfo.idxs, it->cells);
- head = it->head_old;
+ const auto & sinfo = it->sinfo;
+
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ auto & cells = v_cells[sinfo.strm[s]];
+ auto & head = v_heads[sinfo.strm[s]];
+
+ cells.set(sinfo.idxs[s], it->v_cells[s]);
+ head = it->v_heads_old[s];
+ }
}
if (!success) {
return res;
}
-bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo) {
+bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) {
bool updated = false;
auto * sched = lctx->get_sched();
+ if (!sc_info.empty()) {
+ assert(n_stream > 1 && "stream copy should never happen with a single stream");
+
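+ // finish any in-flight computations before copying the KV buffers directly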
+ llama_synchronize(lctx);
+
+ const size_t n_copy = sc_info.ssrc.size();
+
+ for (size_t i = 0; i < n_copy; ++i) {
+ const auto ssrc = sc_info.ssrc[i];
+ const auto sdst = sc_info.sdst[i];
+
+ assert(ssrc < n_stream);
+ assert(sdst < n_stream);
+
+ LLAMA_LOG_DEBUG("%s: copying KV buffer: stream %d to stream %d\n", __func__, ssrc, sdst);
+
+ assert(ssrc != sdst);
+
+ for (uint32_t il = 0; il < layers.size(); ++il) {
+ const auto & layer = layers[il];
+
+ ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
+ ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
+ }
+ }
+ }
+
if (do_shift) {
if (!get_can_shift()) {
GGML_ABORT("The current KV cache / model configuration does not support K-shift");
updated = true;
}
- cells.reset_shift();
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ auto & cells = v_cells[s];
+
+ cells.reset_shift();
+ }
}
if (!dinfo.empty()) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ // note: for now do not consider defrag for n_stream > 1
+ auto & cells = v_cells[seq_to_stream[0]];
+ auto & head = v_heads[seq_to_stream[0]];
+
// apply moves:
{
const auto n_kv = dinfo.ids.size();
}
llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
- const uint32_t n_tokens = ubatch.n_tokens;
-
- uint32_t head_cur = this->head;
-
- // if we have enough unused cells before the current head ->
- // better to start searching from the beginning of the cache, hoping to fill it
- if (head_cur > cells.get_used() + 2*ubatch.n_tokens) {
- head_cur = 0;
- }
+ if (debug > 0) {
+ const auto & cells = v_cells[seq_to_stream[0]];
- if (n_tokens > cells.size()) {
- LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
- return { };
- }
+ const uint32_t head_cur = v_heads[seq_to_stream[0]];
- if (debug > 0) {
- LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
+ LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+ __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
}
}
- uint32_t n_tested = 0;
+ uint32_t n_tokens = ubatch.n_tokens;
+ uint32_t n_seqs = 1;
- // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
- // for non-continuous slots, we test the tokens one by one
- const uint32_t n_test = cont ? n_tokens : 1;
+ if (n_stream > 1) {
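+ // with multiple streams the batch was split with split_equal(), so the tokens divide
+ // evenly across the unique sequences (one stream per sequence)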
+ GGML_ASSERT(n_tokens % ubatch.n_seqs_unq == 0);
- slot_info res;
+ n_seqs = ubatch.n_seqs_unq;
+ n_tokens = n_tokens / n_seqs;
+ }
+
+ slot_info res = {
+ /*.s0 =*/ LLAMA_MAX_SEQ,
+ /*.s1 =*/ 0,
+ /*.strm =*/ { },
+ /*.idxs =*/ { },
+ };
+
+ res.resize(n_seqs);
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const auto seq_id = ubatch.seq_id_unq[s];
+
+ if (n_stream > 1) {
+ GGML_ASSERT(ubatch.n_seq_id[s*n_tokens] == 1);
+ GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
+ }
+
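+ // s0/s1 track the smallest and largest stream touched by this ubatch - get_k()/get_v()
+ // use them to view the [s0, s1] range of streams as one contiguous slice of the cache tensors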
+ res.s0 = std::min<llama_seq_id>(res.s0, seq_to_stream[seq_id]);
+ res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);
+
+ res.strm[s] = seq_to_stream[seq_id];
+ res.idxs[s].reserve(n_tokens);
- auto & idxs = res.idxs;
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
- idxs.reserve(n_tokens);
+ uint32_t head_cur = v_heads[seq_to_stream[seq_id]];
- while (true) {
- if (head_cur + n_test > cells.size()) {
- n_tested += cells.size() - head_cur;
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (head_cur > cells.get_used() + 2*n_tokens) {
head_cur = 0;
- continue;
}
- for (uint32_t i = 0; i < n_test; i++) {
- const auto idx = head_cur;
+ if (n_tokens > cells.size()) {
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
+ return { };
+ }
+
+ uint32_t n_tested = 0;
+
+ // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
+ // for non-continuous slots, we test the tokens one by one
+ const uint32_t n_test = cont ? n_tokens : 1;
+
+ while (true) {
+ if (head_cur + n_test > cells.size()) {
+ n_tested += cells.size() - head_cur;
+ head_cur = 0;
+ continue;
+ }
+
+ for (uint32_t i = 0; i < n_test; i++) {
+ const auto idx = head_cur;
+
+ head_cur++;
+ n_tested++;
- //const llama_pos pos = ubatch.pos[i];
- //const llama_seq_id seq_id = ubatch.seq_id[i][0];
+ //const llama_pos pos = ubatch.pos[i];
+ //const llama_seq_id seq_id = ubatch.seq_id[i][0];
- // can we use this cell? either:
- // - the cell is empty
- // - the cell is occupied only by one sequence:
- // - (disabled) mask causally, if the sequence is the same as the one we are inserting
- // - mask SWA, using current max pos for that sequence in the cache
- // always insert in the cell with minimum pos
- bool can_use = cells.is_empty(idx);
+ // can we use this cell? either:
+ // - the cell is empty
+ // - the cell is occupied only by one sequence:
+ // - (disabled) mask causally, if the sequence is the same as the one we are inserting
+ // - mask SWA, using current max pos for that sequence in the cache
+ // always insert in the cell with minimum pos
+ bool can_use = cells.is_empty(idx);
- if (!can_use && cells.seq_count(idx) == 1) {
- const llama_pos pos_cell = cells.pos_get(idx);
+ if (!can_use && cells.seq_count(idx) == 1) {
+ const llama_pos pos_cell = cells.pos_get(idx);
- // (disabled) causal mask
- // note: it's better to purge any "future" tokens beforehand
- //if (cells.seq_has(idx, seq_id)) {
- // can_use = pos_cell >= pos;
- //}
+ // (disabled) causal mask
+ // note: it's better to purge any "future" tokens beforehand
+ //if (cells.seq_has(idx, seq_id)) {
+ // can_use = pos_cell >= pos;
+ //}
- if (!can_use) {
- const llama_seq_id seq_id_cell = cells.seq_get(idx);
+ if (!can_use) {
+ const llama_seq_id seq_id_cell = cells.seq_get(idx);
- // SWA mask
- if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
- can_use = true;
+ // SWA mask
+ if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
+ can_use = true;
+ }
}
}
- }
- head_cur++;
- n_tested++;
+ if (can_use) {
+ res.idxs[s].push_back(idx);
+ } else {
+ if (cont) {
+ break;
+ }
+ }
+ }
- if (can_use) {
- idxs.push_back(idx);
- } else {
+ if (res.idxs[s].size() == n_tokens) {
break;
}
- }
- if (idxs.size() == n_tokens) {
- break;
- }
+ if (cont) {
+ res.idxs[s].clear();
+ }
- if (cont) {
- idxs.clear();
+ if (n_tested >= cells.size()) {
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ return { };
+ }
}
- if (n_tested >= cells.size()) {
- //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ // we didn't find a suitable slot - return empty result
+ if (res.idxs[s].size() < n_tokens) {
return { };
}
}
- // we didn't find a suitable slot - return empty result
- if (idxs.size() < n_tokens) {
- res.clear();
- }
+ assert(res.s1 >= res.s0);
return res;
}
// keep track of the max sequence position that we would overwrite with this ubatch
// for non-SWA cache, this would be always empty
llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
- for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
seq_pos_max_rm[s] = -1;
}
- assert(ubatch.n_tokens == sinfo.idxs.size());
+ assert(ubatch.n_tokens == sinfo.n_stream()*sinfo.size());
- for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
- const auto idx = sinfo.idxs.at(i);
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ for (uint32_t ii = 0; ii < sinfo.size(); ++ii) {
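+ // the ubatch tokens are laid out stream-major: stream s owns tokens [s*sinfo.size(), (s+1)*sinfo.size())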
+ const uint32_t i = s*sinfo.size() + ii;
- if (!cells.is_empty(idx)) {
- assert(cells.seq_count(idx) == 1);
+ auto & cells = v_cells[sinfo.strm[s]];
- const llama_seq_id seq_id = cells.seq_get(idx);
- const llama_pos pos = cells.pos_get(idx);
+ const auto idx = sinfo.idxs[s][ii];
- seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
+ if (!cells.is_empty(idx)) {
+ assert(cells.seq_count(idx) == 1);
- cells.rm(idx);
- }
+ const llama_seq_id seq_id = cells.seq_get(idx);
+ const llama_pos pos = cells.pos_get(idx);
- cells.pos_set(idx, ubatch.pos[i]);
+ seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
- for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
- cells.seq_add(idx, ubatch.seq_id[i][s]);
+ cells.rm(idx);
+ }
+
+ cells.pos_set(idx, ubatch.pos[i]);
+
+ for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
+ cells.seq_add(idx, ubatch.seq_id[i][s]);
+ }
}
}
// note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
// will be present in the cache. so we have to purge any position which is less than those we would overwrite
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
- for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (seq_pos_max_rm[s] == -1) {
continue;
}
+ GGML_ASSERT(s < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[s]];
+
if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
__func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
}
// move the head at the end of the slot
- head = sinfo.idxs.back() + 1;
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ auto & head = v_heads[sinfo.strm[s]];
+
+ head = sinfo.idxs[s].back() + 1;
+ }
}
bool llama_kv_cache_unified::get_can_shift() const {
}
uint32_t llama_kv_cache_unified::get_size() const {
+ const auto & cells = v_cells[seq_to_stream[0]];
+
return cells.size();
}
+uint32_t llama_kv_cache_unified::get_n_stream() const {
+ return n_stream;
+}
+
bool llama_kv_cache_unified::get_has_shift() const {
- return cells.get_has_shift();
+ bool result = false;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ result |= v_cells[s].get_has_shift();
+ }
+
+ return result;
}
uint32_t llama_kv_cache_unified::get_n_kv() const {
- return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
+ uint32_t result = 0;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ const auto & cells = v_cells[s];
+
+ result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
+ }
+
+ return result;
}
-ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const {
+ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
const int32_t ikv = map_layer_ids.at(il);
auto * k = layers[ikv].k;
- return ggml_view_3d(ctx, k,
- hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv,
+ const uint64_t kv_size = get_size();
+ const uint64_t n_embd_k_gqa = k->ne[0];
+
+ assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
+
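+ // number of streams spanned by this slot info - the view below covers the contiguous range [s0, s1]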
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
+
+ return ggml_view_4d(ctx, k,
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
ggml_row_size(k->type, hparams.n_embd_head_k),
- ggml_row_size(k->type, hparams.n_embd_k_gqa(il)),
- 0);
+ ggml_row_size(k->type, n_embd_k_gqa),
+ ggml_row_size(k->type, n_embd_k_gqa*kv_size),
+ ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
}
-ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const {
+ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
const int32_t ikv = map_layer_ids.at(il);
auto * v = layers[ikv].v;
+ const uint64_t kv_size = get_size();
+ const uint64_t n_embd_v_gqa = v->ne[0];
+
+ // [TAG_V_CACHE_VARIABLE]
+ assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il));
+
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
+
if (!v_trans) {
// note: v->nb[1] <= v->nb[2]
- return ggml_view_3d(ctx, v,
- hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv,
- ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
- ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
- 0);
+ return ggml_view_4d(ctx, v,
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
+ ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
+ ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
}
// note: v->nb[1] > v->nb[2]
- return ggml_view_3d(ctx, v,
- n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v,
- ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
- ggml_row_size(v->type, v->ne[1]), // v->nb[2]
- 0);
+ return ggml_view_4d(ctx, v,
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
+ ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+ ggml_row_size(v->type, kv_size), // v->nb[2]
+ ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
+ ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
}
ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
if (k_idxs && supports_set_rows) {
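+ // with multiple streams the K cache is a 3D tensor [n_embd_k_gqa, kv_size, n_stream];
+ // flatten it to 2D so that ggml_set_rows() can address rows of all streams with the
+ // global indices produced in set_input_k_idxs()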
+ if (k->ne[2] > 1) {
+ k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
+ }
+
return ggml_set_rows(ctx, k, k_cur, k_idxs);
}
// TODO: fallback to old ggml_cpy() method for backwards compatibility
// will be removed when ggml_set_rows() is adopted by all backends
+ GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
+
ggml_tensor * k_view = ggml_view_1d(ctx, k,
n_tokens*n_embd_k_gqa,
ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
auto * v = layers[ikv].v;
- const int64_t n_embd_v_gqa = v->ne[0];
- const int64_t n_tokens = v_cur->ne[2];
+ const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
+ const int64_t n_tokens = v_cur->ne[2];
v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
if (v_idxs && supports_set_rows) {
if (!v_trans) {
+ if (v->ne[2] > 1) {
+ v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
+ }
+
return ggml_set_rows(ctx, v, v_cur, v_idxs);
}
- // the row becomes a single element
- ggml_tensor * v_view = ggml_reshape_3d(ctx, v, 1, v->ne[1], v->ne[0]);
+ // [TAG_V_CACHE_VARIABLE]
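+ // the transposed V cache rows are padded to n_embd_v_gqa_max(), so pad the incoming values to match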
+ if (n_embd_v_gqa < v->ne[0]) {
+ v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+ }
- // note: the V cache is transposed when not using flash attention
- v_cur = ggml_permute(ctx, ggml_reshape_3d(ctx, v_cur, v_cur->ne[0], 1, v_cur->ne[1]), 2, 0, 1, 3);
+ // the row becomes a single element
+ ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
- // note: we can be more explicit here at the cost of extra cont
- // however, above we take advantage that a row of single element is always continuous regardless of the row stride
- //v_cur = ggml_transpose(ctx, v_cur);
- //v_cur = ggml_cont_3d(ctx, v_cur, 1, v_cur->ne[0], v_cur->ne[1]);
+ v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
- // we broadcast the KV indices n_embd_v_gqa times
- // v [1, n_kv, n_embd_v_gqa]
- // v_cur [1, n_tokens, n_embd_v_gqa]
- // v_idxs [n_tokens, 1, 1]
return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
}
// TODO: fallback to old ggml_cpy() method for backwards compatibility
// will be removed when ggml_set_rows() is adopted by all backends
+ GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
+
ggml_tensor * v_view = nullptr;
if (!v_trans) {
ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
const uint32_t n_tokens = ubatch.n_tokens;
- ggml_tensor * v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
+ ggml_tensor * v_idxs;
+
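+ // for the transposed V cache each destination row holds a single element, so one index is
+ // needed per (token, embedding) pair instead of one per token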
+ if (!v_trans) {
+ v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
+ } else {
+ v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens*hparams.n_embd_v_gqa_max());
+ }
ggml_set_input(v_idxs);
}
const uint32_t n_tokens = ubatch->n_tokens;
+ GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
int64_t * data = (int64_t *) dst->data;
- for (int64_t i = 0; i < n_tokens; ++i) {
- data[i] = sinfo.idxs.at(i);
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const int64_t offs = sinfo.strm[s]*get_size();
+
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
+ data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
+ }
}
}
}
const uint32_t n_tokens = ubatch->n_tokens;
+ GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
int64_t * data = (int64_t *) dst->data;
- for (int64_t i = 0; i < n_tokens; ++i) {
- data[i] = sinfo.idxs.at(i);
+ if (!v_trans) {
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const int64_t offs = sinfo.strm[s]*get_size();
+
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
+ data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
+ }
+ }
+ } else {
+ // note: the V cache is transposed when not using flash attention
+ const int64_t kv_size = get_size();
+
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa_max();
+
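+ // emit one flattened index per (token, embedding) element: row j of cell idx lives at offset j*kv_size + idx within its stream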
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const int64_t offs = sinfo.strm[s]*kv_size*n_embd_v_gqa;
+
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ data[s*sinfo.size()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs[s][i];
+ }
+ }
+ }
+ }
+}
+
+void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+
+ int32_t * data = (int32_t *) dst->data;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ const auto & cells = v_cells[s];
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
+ }
}
}
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
float * data = (float *) dst->data;
- const int64_t n_kv = dst->ne[0];
+ const int64_t n_kv = dst->ne[0];
+ const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
+
+ GGML_ASSERT(n_tokens%n_stream == 0);
+
+ // n_tps == n_tokens_per_stream
+ const int64_t n_tps = n_tokens/n_stream;
+ const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
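+ // each stream occupies its own padded block of n_tps_pad rows in the mask tensor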
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
// xxxxx-----
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
for (uint32_t h = 0; h < 1; ++h) {
- for (uint32_t i = 0; i < n_tokens; ++i) {
- const llama_seq_id seq_id = ubatch->seq_id[i][0];
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ for (uint32_t ii = 0; ii < n_tps; ++ii) {
+ const uint32_t i = s*n_tps + ii;
- const llama_pos p1 = ubatch->pos[i];
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
- for (uint32_t j = 0; j < n_kv; ++j) {
- float f = 0.0f;
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
- bool masked = false;
+ const llama_pos p1 = ubatch->pos[i];
- if (cells.is_empty(j)) {
- masked = true;
- } else {
- const llama_pos p0 = cells.pos_get(j);
+ for (uint32_t j = 0; j < n_kv; ++j) {
+ float f = 0.0f;
- // mask the token if not the same sequence
- masked = masked || (!cells.seq_has(j, seq_id));
+ bool masked = false;
- // mask future tokens
- masked = masked || (causal_attn && p0 > p1);
+ if (cells.is_empty(j)) {
+ masked = true;
+ } else {
+ const llama_pos p0 = cells.pos_get(j);
+
+ // mask the token if not the same sequence
+ masked = masked || (!cells.seq_has(j, seq_id));
+
+ // mask future tokens
+ masked = masked || (causal_attn && p0 > p1);
- // apply SWA if any
- masked = masked || (is_masked_swa(p0, p1));
+ // apply SWA if any
+ masked = masked || (is_masked_swa(p0, p1));
- if (!masked && hparams.use_alibi) {
- f = -std::abs(p0 - p1);
+ if (!masked && hparams.use_alibi) {
+ f = -std::abs(p0 - p1);
+ }
}
- }
- if (masked) {
- f = -INFINITY;
- }
+ if (masked) {
+ f = -INFINITY;
+ }
- data[h*(n_kv*n_tokens) + i*n_kv + j] = f;
- }
- }
+ data[h*n_stream*n_tps_pad*n_kv + s*n_tps_pad*n_kv + ii*n_kv + j] = f;
+ }
- // mask padded tokens
- if (data) {
- for (uint32_t i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
- for (uint32_t j = 0; j < n_kv; ++j) {
- data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ // mask padded tokens
+ if (data) {
+ for (uint32_t ii = n_tps; ii < n_tps_pad; ++ii) {
+ for (uint32_t j = 0; j < n_kv; ++j) {
+ data[h*n_stream*n_tps_pad*n_kv + s*n_tps_pad*n_kv + ii*n_kv + j] = -INFINITY;
+ }
+ }
}
}
}
}
}
-void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
- GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-
- int32_t * data = (int32_t *) dst->data;
-
- for (uint32_t i = 0; i < cells.size(); ++i) {
- data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
- }
-}
-
void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
const int64_t n_tokens = ubatch->n_tokens;
+ GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams");
+ const auto & cells = v_cells[0];
+
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
void set_input(const llama_ubatch * ubatch) override;
- ggml_tensor * k_shift; // I32 [kv_size]
+ ggml_tensor * k_shift; // I32 [kv_size*n_stream]
const llama_kv_cache_unified * kv_self;
};
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
- inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
+ inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
ggml_set_input(inp->k_shift);
for (const auto & layer : layers) {
ggml_tensor * k =
ggml_view_3d(ctx, layer.k,
- n_embd_head_k, n_head_kv, cells.size(),
+ n_embd_head_k, n_head_kv, get_size()*n_stream,
ggml_row_size(layer.k->type, n_embd_head_k),
ggml_row_size(layer.k->type, n_embd_k_gqa),
0);
const defrag_info & dinfo) const {
auto res = std::make_unique<llm_graph_result>();
+ GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
+
+ const auto & cells = v_cells[0];
+
const auto & ids = dinfo.ids;
#if 0
}
llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
+ GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
+
+ const auto & cells = v_cells[0];
+
const uint32_t n_layer = layers.size();
const uint32_t n_kv = cells.used_max_p1();
}
void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
- std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
- uint32_t cell_count = 0;
+ io.write(&n_stream, sizeof(n_stream));
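+ // the state is written per stream: the cell count first, then cell metadata and K/V data for non-empty streams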
- // Count the number of cells with the specified seq_id
- // Find all the ranges of cells with this seq id (or all, when -1)
- uint32_t cell_range_begin = cells.size();
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ cell_ranges_t cr { s, {} };
- for (uint32_t i = 0; i < cells.size(); ++i) {
- if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
- ++cell_count;
- if (cell_range_begin == cells.size()) {
- cell_range_begin = i;
- }
- } else {
- if (cell_range_begin != cells.size()) {
- cell_ranges.emplace_back(cell_range_begin, i);
- cell_range_begin = cells.size();
+ uint32_t cell_count = 0;
+
+ const auto & cells = v_cells[s];
+
+ // Count the number of cells with the specified seq_id
+ // Find all the ranges of cells with this seq id (or all, when -1)
+ uint32_t cell_range_begin = cells.size();
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
+ ++cell_count;
+ if (cell_range_begin == cells.size()) {
+ cell_range_begin = i;
+ }
+ } else {
+ if (cell_range_begin != cells.size()) {
+ cr.data.emplace_back(cell_range_begin, i);
+ cell_range_begin = cells.size();
+ }
}
}
- }
- if (cell_range_begin != cells.size()) {
- cell_ranges.emplace_back(cell_range_begin, cells.size());
- }
+ if (cell_range_begin != cells.size()) {
+ cr.data.emplace_back(cell_range_begin, cells.size());
+ }
- // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
- uint32_t cell_count_check = 0;
- for (const auto & range : cell_ranges) {
- cell_count_check += range.second - range.first;
- }
- GGML_ASSERT(cell_count == cell_count_check);
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+ uint32_t cell_count_check = 0;
+ for (const auto & range : cr.data) {
+ cell_count_check += range.second - range.first;
+ }
+ GGML_ASSERT(cell_count == cell_count_check);
- io.write(&cell_count, sizeof(cell_count));
+ io.write(&cell_count, sizeof(cell_count));
- state_write_meta(io, cell_ranges, seq_id);
- state_write_data(io, cell_ranges);
+ // skip empty streams
+ if (cell_count == 0) {
+ continue;
+ }
+
+ state_write_meta(io, cr, seq_id);
+ state_write_data(io, cr);
+ }
}
void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
- uint32_t cell_count;
- io.read_to(&cell_count, sizeof(cell_count));
+ GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
- bool res = true;
- res = res && state_read_meta(io, cell_count, seq_id);
- res = res && state_read_data(io, cell_count);
+ uint32_t n_stream_cur;
+ io.read_to(&n_stream_cur, sizeof(n_stream_cur));
+ if (n_stream_cur != n_stream) {
+ throw std::runtime_error("n_stream mismatch");
+ }
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ uint32_t cell_count;
+ io.read_to(&cell_count, sizeof(cell_count));
+
+ if (cell_count == 0) {
+ continue;
+ }
+
+ const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
- if (!res) {
- if (seq_id == -1) {
- clear(true);
- } else {
- seq_rm(seq_id, -1, -1);
+ bool res = true;
+ res = res && state_read_meta(io, strm, cell_count, seq_id);
+ res = res && state_read_data(io, strm, cell_count);
+
+ if (!res) {
+ if (seq_id == -1) {
+ clear(true);
+ } else {
+ seq_rm(seq_id, -1, -1);
+ }
+ throw std::runtime_error("failed to restore kv cache");
}
- throw std::runtime_error("failed to restore kv cache");
}
}
-void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
- for (const auto & range : cell_ranges) {
+void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
+ const auto & cells = v_cells[cr.strm];
+
+ for (const auto & range : cr.data) {
for (uint32_t i = range.first; i < range.second; ++i) {
std::vector<llama_seq_id> seq_ids;
}
}
-void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
+ const auto & cells = v_cells[cr.strm];
+
const uint32_t v_trans = this->v_trans ? 1 : 0;
const uint32_t n_layer = layers.size();
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ auto * k = layer.k_stream[cr.strm];
+
// Write key type
- const int32_t k_type_i = (int32_t)layer.k->type;
+ const int32_t k_type_i = (int32_t) k->type;
io.write(&k_type_i, sizeof(k_type_i));
// Write row size of key
- const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
+ const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
io.write(&k_size_row, sizeof(k_size_row));
// Read each range of cells of k_size length each into tmp_buf and write out
- for (const auto & range : cell_ranges) {
+ for (const auto & range : cr.data) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * k_size_row;
- io.write_tensor(layer.k, range.first * k_size_row, buf_size);
+ io.write_tensor(k, range.first * k_size_row, buf_size);
}
}
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ auto * v = layer.v_stream[cr.strm];
+
// Write value type
- const int32_t v_type_i = (int32_t)layer.v->type;
+ const int32_t v_type_i = (int32_t) v->type;
io.write(&v_type_i, sizeof(v_type_i));
// Write row size of value
- const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
+ const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
io.write(&v_size_row, sizeof(v_size_row));
// Read each range of cells of v_size length each into tmp_buf and write out
- for (const auto & range : cell_ranges) {
+ for (const auto & range : cr.data) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * v_size_row;
- io.write_tensor(layer.v, range.first * v_size_row, buf_size);
+ io.write_tensor(v, range.first * v_size_row, buf_size);
}
}
} else {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ auto * v = layer.v_stream[cr.strm];
+
// Write value type
- const int32_t v_type_i = (int32_t)layer.v->type;
+ const int32_t v_type_i = (int32_t) v->type;
io.write(&v_type_i, sizeof(v_type_i));
// Write element size
- const uint32_t v_size_el = ggml_type_size(layer.v->type);
+ const uint32_t v_size_el = ggml_type_size(v->type);
io.write(&v_size_el, sizeof(v_size_el));
// Write GQA embedding size
// For each row, we get the element values of each cell
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
// Read each range of cells of v_size_el length each into tmp_buf and write out
- for (const auto & range : cell_ranges) {
+ for (const auto & range : cr.data) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
const size_t buf_size = range_size * v_size_el;
- io.write_tensor(layer.v, src_offset, buf_size);
+ io.write_tensor(v, src_offset, buf_size);
}
}
}
}
}
-bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+ auto & cells = v_cells[strm];
+ auto & head = v_heads[strm];
+
if (dest_seq_id != -1) {
// single sequence
-
seq_rm(dest_seq_id, -1, -1);
llama_batch_allocr balloc(hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
+ ubatch.seq_id_unq[0] = dest_seq_id;
+
for (uint32_t i = 0; i < cell_count; ++i) {
llama_pos pos;
uint32_t n_seq_id;
// keep the head at the old position because we will read the KV data into it in state_read_data()
head = head_cur;
+ LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
+
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
// Assume that this is one contiguous block of cells
GGML_ASSERT(head_cur + cell_count <= cells.size());
return true;
}
-bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+ auto & cells = v_cells[strm];
+ auto & head = v_heads[strm];
+
uint32_t v_trans;
uint32_t n_layer;
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ auto * k = layer.k_stream[strm];
+
// Read type of key
int32_t k_type_i_ref;
io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
- const int32_t k_type_i = (int32_t) layer.k->type;
+ const int32_t k_type_i = (int32_t) k->type;
if (k_type_i != k_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
return false;
// Read row size of key
uint64_t k_size_row_ref;
io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
- const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
+ const size_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
if (k_size_row != k_size_row_ref) {
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
return false;
if (cell_count) {
// Read and set the keys for the whole cell range
- ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+ ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
}
}
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ auto * v = layer.v_stream[strm];
+
// Read type of value
int32_t v_type_i_ref;
io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
- const int32_t v_type_i = (int32_t)layer.v->type;
+ const int32_t v_type_i = (int32_t) v->type;
if (v_type_i != v_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
return false;
// Read row size of value
uint64_t v_size_row_ref;
io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
- const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
+ const size_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
if (v_size_row != v_size_row_ref) {
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
return false;
if (cell_count) {
// Read and set the values for the whole cell range
- ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
}
}
} else {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ auto * v = layer.v_stream[strm];
+
// Read type of value
int32_t v_type_i_ref;
io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
- const int32_t v_type_i = (int32_t)layer.v->type;
+ const int32_t v_type_i = (int32_t) v->type;
if (v_type_i != v_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
return false;
// Read element size of value
uint32_t v_size_el_ref;
io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
- const size_t v_size_el = ggml_type_size(layer.v->type);
+ const size_t v_size_el = ggml_type_size(v->type);
if (v_size_el != v_size_el_ref) {
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
return false;
// For each row in the transposed matrix, read the values for the whole cell range
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
- ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
}
}
}
llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
n_kv = kv->get_size();
+ const uint32_t n_stream = kv->get_n_stream();
+
// create a dummy slot info - the actual data is irrelevant. we just need to build the graph
sinfos.resize(1);
- sinfos[0].idxs.resize(1);
- sinfos[0].idxs[0] = 0;
+ sinfos[0].s0 = 0;
+ sinfos[0].s1 = n_stream - 1;
+ sinfos[0].idxs.resize(n_stream);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ sinfos[0].strm.push_back(s);
+ sinfos[0].idxs[s].resize(1, 0);
+ }
}
llama_kv_cache_unified_context::llama_kv_cache_unified_context(
llama_kv_cache_unified * kv,
llama_context * lctx,
bool do_shift,
- defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
- if (!do_shift && this->dinfo.empty()) {
+ defrag_info dinfo,
+ stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) {
+ if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
status = LLAMA_MEMORY_STATUS_NO_UPDATE;
}
}
// no ubatches -> this is a KV cache update
if (ubatches.empty()) {
- kv->update(lctx, do_shift, dinfo);
+ kv->update(lctx, do_shift, dinfo, sc_info);
return true;
}
}
ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
- return kv->get_k(ctx, il, n_kv);
+ return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
}
ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const {
- return kv->get_v(ctx, il, n_kv);
+ return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
}
ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {