const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
- const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
- supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
- if (!supports_set_rows) {
- // ref: https://github.com/ggml-org/llama.cpp/pull/14363
- GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
- }
-
- if (!supports_set_rows) {
- LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
- }
}
bool llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
bool success = true;
for (const auto & ubatch : ubatches) {
- // non-continuous slots require support for ggml_set_rows()
- const bool cont = supports_set_rows ? false : true;
-
// only find a suitable slot for the ubatch. don't modify the cells yet
- const auto sinfo_new = find_slot(ubatch, cont);
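+ // ggml_set_rows() is now always available, so non-continuous slots can always be used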
+ const auto sinfo_new = find_slot(ubatch, false);
if (sinfo_new.empty()) {
success = false;
break;
return result;
}
-bool llama_kv_cache::get_supports_set_rows() const {
- return supports_set_rows;
-}
-
ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
const int32_t ikv = map_layer_ids.at(il);
}
ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
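+ // note: the destination rows are provided via k_idxs (filled from sinfo by set_input_k_idxs()), so sinfo itself is not needed here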
+ GGML_UNUSED(sinfo);
+
const int32_t ikv = map_layer_ids.at(il);
auto * k = layers[ikv].k;
- const int64_t n_embd_k_gqa = k->ne[0];
const int64_t n_tokens = k_cur->ne[2];
k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
- if (k_idxs && supports_set_rows) {
- if (k->ne[2] > 1) {
- k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
- }
-
- return ggml_set_rows(ctx, k, k_cur, k_idxs);
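+ // merge the stream dimension (k->ne[2]) into the rows so that ggml_set_rows() can target rows in any stream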
+ if (k->ne[2] > 1) {
+ k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
}
- // TODO: fallback to old ggml_cpy() method for backwards compatibility
- // will be removed when ggml_set_rows() is adopted by all backends
-
- GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
- ggml_tensor * k_view = ggml_view_1d(ctx, k,
- n_tokens*n_embd_k_gqa,
- ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
- return ggml_cpy(ctx, k_cur, k_view);
+ return ggml_set_rows(ctx, k, k_cur, k_idxs);
}
ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
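+ // note: as in cpy_k(), the destination rows come via v_idxs, so sinfo is not needed here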
+ GGML_UNUSED(sinfo);
+
const int32_t ikv = map_layer_ids.at(il);
auto * v = layers[ikv].v;
v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
- if (v_idxs && supports_set_rows) {
- if (!v_trans) {
- if (v->ne[2] > 1) {
- v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
- }
-
- return ggml_set_rows(ctx, v, v_cur, v_idxs);
- }
-
- // [TAG_V_CACHE_VARIABLE]
- if (n_embd_v_gqa < v->ne[0]) {
- v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+ if (!v_trans) {
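+ // same as for K: merge the stream dimension into the rows before calling ggml_set_rows()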
+ if (v->ne[2] > 1) {
+ v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
}
- // the row becomes a single element
- ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
- v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
- return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+ return ggml_set_rows(ctx, v, v_cur, v_idxs);
}
- // TODO: fallback to old ggml_cpy() method for backwards compatibility
- // will be removed when ggml_set_rows() is adopted by all backends
-
- GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
+ // [TAG_V_CACHE_VARIABLE]
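+ // pad v_cur with zeros up to the cache row size when this layer has a smaller V embedding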
+ if (n_embd_v_gqa < v->ne[0]) {
+ v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+ }
- ggml_tensor * v_view = nullptr;
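+ // with a transposed V cache a token's values are not contiguous in the buffer, so ggml_set_rows() writes one element per row: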
+ // the row becomes a single element
+ ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
- if (!v_trans) {
- v_view = ggml_view_1d(ctx, v,
- n_tokens*n_embd_v_gqa,
- ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
- } else {
- v_cur = ggml_transpose(ctx, v_cur);
+ v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
- v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
- (v->ne[1] )*ggml_element_size(v),
- (sinfo.head())*ggml_element_size(v));
- }
-
- return ggml_cpy(ctx, v_cur, v_view);
+ return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
}
ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
}
void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
- if (!supports_set_rows) {
- return;
- }
-
const uint32_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
}
void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
- if (!supports_set_rows) {
- return;
- }
-
const uint32_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
return n_kv;
}
-bool llama_kv_cache_context::get_supports_set_rows() const {
- return kv->get_supports_set_rows();
-}
-
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
}