params.n_keep = value;
}
));
+ add_opt(common_arg( // registers the --swa-full command-line switch
+ {"--swa-full"},
+ string_format("use full-size SWA cache (default: %s)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+ [](common_params & params) { // handler receives no value argument: the option is a plain on-switch
+ params.swa_full = true; // the flag can only enable the full-size SWA cache; this handler never sets it back to false
+ }
+ ));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
+ cparams.swa_full = params.swa_full;
if (params.reranking) {
cparams.embeddings = true;
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
+ bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits)
- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
- bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
- bool no_perf; // whether to measure performance timings
- bool op_offload; // whether to offload host tensor operations to device
+ bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+ bool flash_attn; // use flash attention [EXPERIMENTAL]
+ bool no_perf; // disable performance timings
+ bool op_offload; // offload host tensor operations to device
+ bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
};
// model quantization parameters
llama_pos p1,
int d);
+ // Returns the smallest position present in the KV cache for the specified sequence
+ // This is typically non-zero only for SWA caches
+ // Returns -1 if the sequence is empty
+ LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+ struct llama_context * ctx,
+ llama_seq_id seq_id);
+
// Returns the largest position present in the KV cache for the specified sequence
+ // Returns -1 if the sequence is empty
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
struct llama_context * ctx,
- llama_seq_id seq_id);
+ llama_seq_id seq_id);
// Defragment the KV cache
// This will be applied:
// Requires KV cache.
// For encoder-decoder contexts, processes the batch using the decoder.
// A positive return value does not indicate a fatal error, but rather a warning.
- // 0 - success
- // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- // < 0 - error. the KV cache state is restored to the state before this call
+ // Upon non-zero return values, the KV cache state is restored to the state before this call
+ // 0 - success
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+ // 2 - aborted
+ // -1 - invalid input batch
+ // < -1 - error
LLAMA_API int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch);
}
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
cparams.op_offload = params.op_offload;
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
// init the memory module
if (!hparams.vocab_only) {
llama_memory_params params_mem = {
- /*.type_k =*/ params.type_k,
- /*.type_v =*/ params.type_v,
+ /*.type_k =*/ params.type_k,
+ /*.type_v =*/ params.type_v,
+ /*.swa_full =*/ params.swa_full,
};
memory.reset(model.create_memory(params_mem, cparams));
// find KV slot
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
return 1;
}
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
+ /*.swa_full =*/ true,
};
return result;
kv->seq_div(seq_id, p0, p1, d);
}
+llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) { // public wrapper: smallest position stored in ctx's KV cache for seq_id
+ const auto * kv = ctx->get_kv_self();
+ if (!kv) {
+ return -1; // context has no KV cache: report -1, the same value used for an empty sequence
+ }
+
+ return kv->seq_pos_min(seq_id); // the cache implementation performs the actual lookup
+}
+
// deprecated
llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
return llama_kv_self_seq_pos_max(ctx, seq_id);
llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
- return 0;
+ return -1;
}
return kv->seq_pos_max(seq_id);
int32_t llama_decode(
llama_context * ctx,
llama_batch batch) {
- const int ret = ctx->decode(batch);
+ int ret = ctx->decode(batch);
+
+ // defrag and try again
+ // TODO: distinguish return code when we are sure that even after defrag there is no space available
+ if (ret == 1) {
+ llama_kv_self_defrag(ctx);
+ ret = ctx->decode(batch);
+
+ if (ret == 1) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+ return ret;
+ }
+ }
+
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}
#include <cmath>
#include <cstring>
-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
- // TODO move to hparams if a T5 variant appears that uses a different value
- const int64_t max_distance = 128;
-
- if (bidirectional) {
- n_buckets >>= 1;
- }
-
- const int64_t max_exact = n_buckets >> 1;
-
- int32_t relative_position = x - y;
- int32_t relative_bucket = 0;
-
- if (bidirectional) {
- relative_bucket += (relative_position > 0) * n_buckets;
- relative_position = abs(relative_position);
- } else {
- relative_position = -std::min<int32_t>(relative_position, 0);
- }
-
- int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
- relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
- relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
- return relative_bucket;
-}
-
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
const int64_t n_tokens = ubatch->n_tokens;
void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
if (pos_bucket) {
- const int64_t n_tokens = ubatch->n_tokens;
-
- GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
- GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
- int32_t * data = (int32_t *) pos_bucket->data;
-
- const int64_t n_kv = kv_self->n;
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- for (int i = 0; i < n_kv; ++i) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
- }
- }
- }
+ kv_self->set_input_pos_bucket(pos_bucket, ubatch);
}
}
}
void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
- if (self_kq_mask || self_kq_mask_swa) {
- const int64_t n_kv = kv_self->n;
- const int64_t n_tokens = ubatch->n_tokens;
- const int64_t n_seq_tokens = ubatch->n_seq_tokens;
- const int64_t n_seqs = ubatch->n_seqs;
-
- float * data = nullptr;
- float * data_swa = nullptr;
-
- if (self_kq_mask) {
- GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
- data = (float *) self_kq_mask->data;
- }
-
- if (self_kq_mask_swa) {
- GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
- data_swa = (float *) self_kq_mask_swa->data;
- }
-
- // Use only the previous KV cells of the correct sequence for each token of the ubatch.
- // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
- // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
- // Causal mask:
- // xxx-------
- // xxxx------
- // xxxxx-----
- // Non-causal mask:
- // xxxxx-----
- // xxxxx-----
- // xxxxx-----
- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
- for (int h = 0; h < 1; ++h) {
- for (int s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
- for (int j = 0; j < n_seq_tokens; ++j) {
- const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
- for (int i = 0; i < n_kv; ++i) {
- float f;
- // mask the token if:
- if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
- || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
- ) {
- f = -INFINITY;
- } else {
- if (hparams.use_alibi) {
- f = -std::abs(kv_self->cells[i].pos - pos);
- } else {
- f = 0.0f;
- }
- }
-
- if (data) {
- data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
- }
-
- // may need to cut off old tokens for sliding window
- // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
- if (data_swa) {
- if (hparams.n_attn_chunk) {
- llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
- if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
- f = -INFINITY;
- }
- } else {
- if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
- f = -INFINITY;
- }
- }
- data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
- }
- }
- }
- }
+ if (self_kq_mask) {
+ kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+ }
+}
- // mask padded tokens
- if (data) {
- for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
- for (int j = 0; j < n_kv; ++j) {
- data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
- }
- }
- }
+void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+ if (self_kq_mask) {
+ kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+ }
- // mask padded tokens
- if (data_swa) {
- for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
- for (int j = 0; j < n_kv; ++j) {
- data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
- }
- }
- }
- }
+ if (self_kq_mask_swa) {
+ kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
}
n_layer (hparams.n_layer),
n_rot (hparams.n_rot),
n_ctx (cparams.n_ctx),
- n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
n_head (hparams.n_head()),
n_head_kv (hparams.n_head_kv()),
n_embd_head_k (hparams.n_embd_head_k),
auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
- const auto n_kv = kv_self->n;
+ const auto n_kv = kv_self->get_n();
auto & cur = inp->pos_bucket;
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * v_mla,
- bool v_trans,
float kq_scale) const {
- //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
- //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
- //const int64_t n_head = hparams.n_head(il);
- //const int64_t n_head_kv = hparams.n_head_kv(il);
+ const bool v_trans = v->nb[1] > v->nb[2];
- //const auto & n_embd_head_k = hparams.n_embd_head_k;
- //const auto & n_embd_head_v = hparams.n_embd_head_v;
+ q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+ k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+ v = ggml_permute(ctx0, v, 0, 2, 1, 3);
const auto n_tokens = q->ne[1];
const auto n_head = q->ne[2];
const auto & kq_mask = inp->get_kq_mask();
- ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
- //cb(q, "q", il);
-
- ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
- //cb(k, "k", il);
-
- ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
- //cb(k, "v", il);
-
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = k_cur;
+ ggml_tensor * v = v_cur;
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
- const auto n_kv = kv_self->n;
-
- inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
- //cb(inp->self_kq_mask, "KQ_mask", -1);
- ggml_set_input(inp->self_kq_mask);
-
- inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ {
+ GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA");
+ GGML_ASSERT(hparams.n_swa == 0 && "Use llama_kv_cache_unified_iswa for SWA");
- if (hparams.n_swa_pattern > 1) {
- GGML_ASSERT(hparams.n_swa > 0);
+ const auto n_kv = kv_self->get_n();
- inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
- //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
- ggml_set_input(inp->self_kq_mask_swa);
+ inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+ //cb(inp->self_kq_mask, "KQ_mask", -1);
+ ggml_set_input(inp->self_kq_mask);
- inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
ggml_build_forward_expand(gf, v_cur);
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
- const auto & n_ctx = cparams.n_ctx;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ // store to KV cache
+ {
+ ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
+ ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+ }
- const auto n_tokens = q_cur->ne[2];
+ const auto & kq_mask = inp->get_kq_mask();
- const bool v_trans = !cparams.flash_attn;
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = kv_self->get_k(ctx0, il);
+ ggml_tensor * v = kv_self->get_v(ctx0, il);
- // store to KV cache
- {
- const auto kv_head = kv_self->head;
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+ cb(cur, "kqv_out", il);
- GGML_ASSERT(kv_self->size == n_ctx);
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ }
- ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
- //cb(k_cache_view, "k_cache_view", il);
+ if (wo_b) {
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
- // note: storing RoPE-ed version of K in the KV cache
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
+ return cur;
+}
- v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
+llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
+ const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
- ggml_tensor * v_cache_view = nullptr;
+ auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
- if (!v_trans) {
- v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
- } else {
- // note: the V cache is transposed when not using flash attention
- v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
- ( n_ctx)*ggml_element_size(kv_self->v_l[il]),
- (kv_head)*ggml_element_size(kv_self->v_l[il]));
+ {
+ const auto n_kv = kv_self->get_kv_base()->get_n();
- v_cur = ggml_transpose(ctx0, v_cur);
- }
- //cb(v_cache_view, "v_cache_view", il);
+ inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+ //cb(inp->self_kq_mask, "KQ_mask", -1);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ }
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
+ if (hparams.n_swa_pattern > 1) {
+ GGML_ASSERT(hparams.n_swa > 0 && "Use llama_kv_cache_unified for non-SWA");
+
+ const auto n_kv = kv_self->get_kv_swa()->get_n();
+
+ inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+ //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
+ ggml_set_input(inp->self_kq_mask_swa);
+
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
}
+ return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_kv_unified_iswa * inp,
+ ggml_cgraph * gf,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(gf, q_cur);
+ ggml_build_forward_expand(gf, k_cur);
+ ggml_build_forward_expand(gf, v_cur);
+
const bool is_swa = hparams.is_swa(il);
+ const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+ const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
+ // store to KV cache
+ {
+ ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+ ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+ }
+
const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
- const auto n_kv = kv_self->n;
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = kv->get_k(ctx0, il);
+ ggml_tensor * v = kv->get_v(ctx0, il);
- const int64_t n_head_kv = hparams.n_head_kv(il);
-
- const auto & n_embd_head_k = hparams.n_embd_head_k;
- const auto & n_embd_head_v = hparams.n_embd_head_v;
-
- ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
- //cb(q, "q", il);
-
- ggml_tensor * k =
- ggml_view_3d(ctx0, kv_self->k_l[il],
- n_embd_head_k, n_kv, n_head_kv,
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
- 0);
- //cb(k, "k", il);
-
- ggml_tensor * v = !v_trans ?
- ggml_view_3d(ctx0, kv_self->v_l[il],
- n_embd_head_v, n_kv, n_head_kv,
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
- 0) :
- ggml_view_3d(ctx0, kv_self->v_l[il],
- n_kv, n_embd_head_v, n_head_kv,
- ggml_element_size(kv_self->v_l[il])*n_ctx,
- ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
- 0);
-
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
const auto & kq_mask = inp->get_kq_mask_cross();
- ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
- //cb(q, "q", il);
-
- ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
- //cb(k, "k", il);
-
- ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
- //cb(k, "v", il);
-
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = k_cur;
+ ggml_tensor * v = v_cur;
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
ggml_build_forward_expand(gf, cur);
}
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // maps the position offset (x - y) to a relative-position bucket index
+ // TODO move to hparams if a T5 variant appears that uses a different value
+ const int64_t max_distance = 128; // offsets at or beyond this distance all clamp to the last bucket
+
+ if (bidirectional) {
+ n_buckets >>= 1; // split the buckets into two halves, one per sign of the offset
+ }
+
+ const int64_t max_exact = n_buckets >> 1; // offsets below this get one bucket each; larger ones are binned logarithmically
+
+ int32_t relative_position = x - y;
+ int32_t relative_bucket = 0;
+
+ if (bidirectional) {
+ relative_bucket += (relative_position > 0) * n_buckets; // positive offsets are placed in the upper half of the buckets
+ relative_position = abs(relative_position);
+ } else {
+ relative_position = -std::min<int32_t>(relative_position, 0); // positive offsets collapse to 0, negative ones become their magnitude
+ }
+
+ int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); // NOTE(review): computed even for small offsets; relative_position == 0 gives logf(0), and max_exact == 0 divides by zero - the value is discarded for small offsets, but confirm the float->int32 conversion is safe
+ relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1); // never exceed the last bucket
+ relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large); // exact bucket for small offsets, logarithmic bucket otherwise
+
+ return relative_bucket;
+}
class llama_memory_i;
class llama_kv_cache_unified;
+class llama_kv_cache_unified_iswa;
class llama_kv_cache_recurrent;
// certain models (typically multi-modal) can produce different types of graphs
void set_input(const llama_ubatch * ubatch) override;
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
+
+ const llama_hparams & hparams;
+ const llama_cparams & cparams;
+
+ const llama_kv_cache_unified * kv_self;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_kv_unified_iswa(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_unified_iswa * kv_self) :
+ hparams(hparams),
+ cparams(cparams),
+ kv_self(kv_self) {
+ }
+ ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
const llama_hparams & hparams;
const llama_cparams & cparams;
- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_unified_iswa * kv_self;
};
class llm_graph_input_attn_cross : public llm_graph_input_i {
const int64_t n_layer;
const int64_t n_rot;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
- const int64_t n_ctx_per_seq;
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head_k;
ggml_tensor * build_attn_mha(
ggml_cgraph * gf,
- ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
- ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
- ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+ ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
- ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
- bool v_trans,
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale) const;
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
float kq_scale,
int il) const;
+ llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_kv_unified_iswa * inp,
+ ggml_cgraph * gf,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
llm_graph_input_attn_cross * build_attn_inp_cross() const;
ggml_tensor * build_attn(
ggml_tensor * cls_out,
ggml_tensor * cls_out_b) const;
};
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
};
+enum llama_swa_type { // kind of sliding window attention (SWA); used as the type of the `swa_type` hyper-parameter field
+ LLAMA_SWA_TYPE_NONE = 0, // the default assigned to `swa_type`
+ LLAMA_SWA_TYPE_STANDARD = 1, // NOTE(review): presumably a plain window of the last n_swa positions - confirm against the mask code
+ LLAMA_SWA_TYPE_CHUNKED = 2, // NOTE(review): presumably chunk-aligned masking (the role n_attn_chunk used to play) - confirm
+};
+
struct llama_hparams_posnet {
uint32_t n_embd;
uint32_t n_layer;
uint32_t n_embd_features = 0;
uint32_t n_layer;
uint32_t n_rot;
- uint32_t n_swa = 0; // sliding window attention (SWA)
- uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
std::array<int, 4> rope_sections;
+ // Sliding Window Attention (SWA)
+ llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+ uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA)
+ uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
+
// for State Space Models
uint32_t ssm_d_conv = 0;
uint32_t ssm_d_inner = 0;
bool causal_attn = true;
bool use_alibi = false;
bool attn_soft_cap = false;
+ bool use_kq_norm = true;
+ // llama4
uint32_t n_moe_layer_step = 0;
- bool use_kq_norm = true;
- uint32_t n_attn_chunk = 0;
- // values below seems to be fixed on llama4
uint32_t n_no_rope_layer_step = 4;
uint32_t n_attn_temp_floor_scale = 8192;
float f_attn_temp_scale = 0.1;
}
llama_kv_cache_unified::llama_kv_cache_unified(
- const llama_model & model,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- uint32_t kv_size,
- uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
- const int32_t n_layer = hparams.n_layer;
-
- has_shift = false;
- can_shift = true;
-
- LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n",
- __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding);
-
+ const llama_model & model,
+ layer_filter_cb && filter,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ uint32_t kv_size,
+ uint32_t padding,
+ uint32_t n_swa,
+ llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
- head = 0;
- size = kv_size;
- used = 0;
-
this->type_k = type_k;
this->type_v = type_v;
- cells.clear();
- cells.resize(kv_size);
-
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
- /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+ /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
return it->second;
};
- k_l.reserve(n_layer);
- v_l.reserve(n_layer);
+ head = 0;
+ size = kv_size;
+ used = 0;
- for (int i = 0; i < n_layer; i++) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+ cells.resize(kv_size);
+
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ continue;
+ }
+
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
const char * dev_name = "CPU";
ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
if (offload) {
- auto * dev = model.dev_layer(i);
+ auto * dev = model.dev_layer(il);
buft = ggml_backend_dev_buffer_type(dev);
dev_name = ggml_backend_dev_name(dev);
}
- LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name);
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
throw std::runtime_error("failed to create ggml context for kv cache");
}
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
- ggml_format_name(k, "cache_k_l%d", i);
- ggml_format_name(v, "cache_v_l%d", i);
- k_l.push_back(k);
- v_l.push_back(v);
+ ggml_tensor * k;
+ ggml_tensor * v;
+
+ k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
+ v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
+
+ ggml_format_name(k, "cache_k_l%d", il);
+ ggml_format_name(v, "cache_v_l%d", il);
+
+ map_layer_ids[il] = layers.size();
+ layers.push_back({ il, k, v });
}
// allocate tensors and initialize the buffers to avoid NaNs in the padding
if (!buf) {
throw std::runtime_error("failed to allocate buffer for kv cache");
}
- ggml_backend_buffer_clear(buf, 0);
+
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+ ggml_backend_buffer_clear(buf, 0);
bufs.emplace_back(buf);
}
const size_t memory_size_k = size_k_bytes();
const size_t memory_size_v = size_v_bytes();
- LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
}
void llama_kv_cache_unified::clear() {
- for (int32_t i = 0; i < (int32_t) size; ++i) {
+ for (uint32_t i = 0; i < size; ++i) {
cells[i].pos = -1;
cells[i].seq_id.clear();
}
+
head = 0;
used = 0;
} else {
continue;
}
+
if (cells[i].is_empty()) {
// keep count of the number of used cells
if (cells[i].pos >= 0) {
for (uint32_t i = 0; i < size; ++i) {
if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
has_shift = true;
+
cells[i].pos += delta;
cells[i].delta += delta;
}
}
-llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
- llama_pos result = 0;
+llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { // smallest position among the cells that carry seq_id, or -1 when no cell does
+ llama_pos result = std::numeric_limits<llama_pos>::max(); // sentinel meaning "no matching cell seen yet"
for (uint32_t i = 0; i < size; ++i) {
if (cells[i].has_seq_id(seq_id)) {
- result = std::max(result, cells[i].pos);
+ result = std::min(result, cells[i].pos);
}
}
+ if (result == std::numeric_limits<llama_pos>::max()) { // sentinel still in place: treat the sequence as empty
+ result = -1;
+ }
+
return result;
}
-void llama_kv_cache_unified::restore() {
- if (pending.ranges.empty()) {
- return;
- }
+llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { // largest position among the cells that carry seq_id
+ llama_pos result = -1; // value returned when no cell carries seq_id
- uint32_t new_head = size;
+ for (uint32_t i = 0; i < size; ++i) { // linear scan over every cell of the cache
+ if (cells[i].has_seq_id(seq_id)) {
+ result = std::max(result, cells[i].pos);
+ }
+ }
- for (auto & range : pending.ranges) {
- for (uint32_t i = range.c0; i < range.c1; ++i) {
- cells[i].seq_id.clear();
+ return result;
+}
- // keep count of the number of used cells
- if (cells[i].pos >= 0) {
- used--;
- }
+void llama_kv_cache_unified::restore() { // rolls every cell recorded in recovery.cells back to its saved state
+ for (const auto & [id, cell] : recovery.cells) { // id: cell index, cell: the saved copy of that cell
+ // TODO: move to new `struct kv_cells`
+ const bool is_empty0 = cells[id].is_empty(); // emptiness of the cell as it is now
+ const bool is_empty1 = cell.is_empty(); // emptiness of the saved copy
- cells[i].pos = -1;
+ if (!is_empty0 && is_empty1) { // occupied now, empty after the rollback: one fewer used cell
+ used--;
+ } else if (is_empty0 && !is_empty1) { // empty now, occupied after the rollback: one more used cell
+ used++;
}
- new_head = std::min(new_head, range.c0);
+ cells[id] = cell; // overwrite the live cell with the saved copy
}
- if (new_head != size && new_head < head) {
- head = new_head;
- }
+ recovery.clear(); // the saved states have been applied; drop them
}
void llama_kv_cache_unified::commit() {
- if (pending.ranges.empty()) {
- LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
- __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
+ if (recovery.cells.empty()) { // nothing was recorded for rollback: warn and leave, since a commit is not expected in that state
+ LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
return;
}
- pending.ranges.clear();
+ recovery.clear(); // presumably discards the saved cell states so the current cache contents become final - confirm against recovery's definition
}
bool llama_kv_cache_unified::update(llama_context & lctx) {
head = 0;
}
-llama_sbatch llama_kv_cache_unified::sbatch_init(
- const llama_batch & batch,
- bool logits_all) {
+llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) { // wraps the batch in a llama_sbatch built with hparams.n_embd, `true` and logits_all
return llama_sbatch(batch, hparams.n_embd, true, logits_all);
}
-llama_ubatch llama_kv_cache_unified::ubatch_next(
- llama_sbatch & sbatch,
- uint32_t n_ubatch,
- bool embd_pooled) const {
+llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const { // returns sbatch.split_simple(n_ubatch); embd_pooled is ignored
GGML_UNUSED(embd_pooled);
return sbatch.split_simple(n_ubatch);
}
-bool llama_kv_cache_unified::find_slot(
- const llama_ubatch & ubatch) {
+bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
const uint32_t n_tokens = ubatch.n_tokens;
- const uint32_t n_seqs = ubatch.n_seqs;
- const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
// if we have enough unused cells before the current head ->
// better to start searching from the beginning of the cache, hoping to fill it
return false;
}
+// Uncomment the define below to log the cache state at the start and at the end of find_slot().
+// Both debug regions of this function are guarded with #ifdef, so they are always enabled together.
+//#define FIND_SLOT_DEBUG 1
+#ifdef FIND_SLOT_DEBUG
+    LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
+
+    // for debugging: dump one character per cell, 256 cells per line
+    //   '.'     -> cell with pos == -1
+    //   <seqid> -> first sequence id stored in the cell
+    {
+        std::string ss;
+        if (n_swa > 0) {
+            for (uint32_t i = 0; i < size; ++i) {
+                if (cells[i].pos == -1) {
+                    ss += '.';
+                } else {
+                    ss += std::to_string(*cells[i].seq_id.begin());
+                }
+                if (i%256 == 255) {
+                    ss += '\n';
+                }
+            }
+        }
+        LLAMA_LOG_WARN("\n%s\n", ss.c_str());
+    }
+#endif
+
uint32_t n_tested = 0;
while (true) {
}
}
- for (uint32_t s = 0; s < n_seqs; s++) {
- for (uint32_t i = 0; i < n_seq_tokens; ++i) {
- uint32_t k = s*n_seq_tokens + i;
- cells[head + k].pos = ubatch.pos[k];
+ for (uint32_t i = 0; i < n_tokens; ++i) {
+ // remember the original state
+ if (recovery.cells.find(head + i) == recovery.cells.end()) {
+ recovery.cells[head + i] = cells[head + i];
+ }
+
+ cells[head + i].pos = ubatch.pos[i];
- for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
- cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
- }
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
+ cells[head + i].seq_id.insert(ubatch.seq_id[i][j]);
}
}
used += n_tokens;
- pending.ranges.push_back({head, head + n_tokens});
-
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
- //printf("n = %5d, used = %5d, head = %5d\n", n, used, head);
+#ifdef FIND_SLOT_DEBUG
+ LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
+#endif
return true;
}
}
bool llama_kv_cache_unified::get_can_shift() const {
- return can_shift;
+ return true; // always true now; the previous version returned the can_shift member
+}
+
+// Returns `n`, the number of leading cells covered by the K/V views and the KQ mask.
+uint32_t llama_kv_cache_unified::get_n() const {
+    return n;
+}
+
+// Returns `size`, the total number of cells in this cache.
+uint32_t llama_kv_cache_unified::get_size() const {
+    return size;
+}
+
+// Builds a 3D view [n_embd_head_k, n_head_kv(il), n] over the K tensor that belongs to model layer `il`.
+// The view starts at offset 0, i.e. it covers the first `n` cells of the cache.
+ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const {
+    // translate the model layer index into the index of the cache-local layer
+    const int32_t layer_idx = map_layer_ids.at(il);
+
+    auto * k_all = layers[layer_idx].k;
+
+    const auto nb1 = ggml_row_size(k_all->type, hparams.n_embd_head_k);    // bytes per head
+    const auto nb2 = ggml_row_size(k_all->type, hparams.n_embd_k_gqa(il)); // bytes per cell
+
+    return ggml_view_3d(ctx, k_all, hparams.n_embd_head_k, hparams.n_head_kv(il), n, nb1, nb2, 0);
+}
+
+// Builds a 3D view over the V tensor that belongs to model layer `il`, covering the first `n` cells.
+// The shape and strides depend on whether the V cache is stored transposed (`v_trans`).
+ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il) const {
+    const int32_t layer_idx = map_layer_ids.at(il);
+
+    auto * v_all = layers[layer_idx].v;
+
+    if (v_trans) {
+        // transposed storage: [n, n_head_kv, n_embd_head_v]
+        // note: v->nb[1] > v->nb[2]
+        return ggml_view_3d(ctx, v_all,
+                n, hparams.n_head_kv(il), hparams.n_embd_head_v,
+                ggml_row_size(v_all->type, v_all->ne[1]*hparams.n_embd_head_v), // v->nb[1]
+                ggml_row_size(v_all->type, v_all->ne[1]),                       // v->nb[2]
+                0);
+    }
+
+    // regular storage: [n_embd_head_v, n_head_kv, n]
+    // note: v->nb[1] <= v->nb[2]
+    return ggml_view_3d(ctx, v_all,
+            hparams.n_embd_head_v, hparams.n_head_kv(il), n,
+            ggml_row_size(v_all->type, hparams.n_embd_head_v),    // v->nb[1]
+            ggml_row_size(v_all->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
+            0);
+}
+
+// Creates the graph node that copies `k_cur` into the K tensor of model layer `il`,
+// starting at cell `head`. The number of tokens is taken from k_cur->ne[2].
+ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
+    const int32_t layer_idx = map_layer_ids.at(il);
+
+    auto * k_all = layers[layer_idx].k;
+
+    const int64_t n_new = k_cur->ne[2];
+
+    // flat destination window: n_new rows of n_embd_k_gqa elements, `head` rows into the tensor
+    const auto n_elements = n_new*hparams.n_embd_k_gqa(il);
+    const auto offset     = ggml_row_size(k_all->type, hparams.n_embd_k_gqa(il))*head;
+
+    ggml_tensor * k_dst = ggml_view_1d(ctx, k_all, n_elements, offset);
+
+    return ggml_cpy(ctx, k_cur, k_dst);
+}
+
+// Creates the graph node that copies `v_cur` into the V tensor of model layer `il`,
+// starting at cell `head`. The number of tokens is taken from v_cur->ne[2].
+ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
+    const int32_t layer_idx = map_layer_ids.at(il);
+
+    auto * v_all = layers[layer_idx].v;
+
+    const int64_t n_new = v_cur->ne[2];
+
+    // collapse the source to 2D: [n_embd_v_gqa, n_new]
+    v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_new);
+
+    ggml_tensor * v_dst = nullptr;
+
+    if (v_trans) {
+        // note: the V cache is transposed when not using flash attention
+        v_dst = ggml_view_2d(ctx, v_all, n_new, hparams.n_embd_v_gqa(il),
+                (v_all->ne[1])*ggml_element_size(v_all),
+                (        head)*ggml_element_size(v_all));
+
+        // the source has to be transposed as well to match the destination layout
+        v_cur = ggml_transpose(ctx, v_cur);
+    } else {
+        // flat destination window: n_new rows of n_embd_v_gqa elements, `head` rows into the tensor
+        v_dst = ggml_view_1d(ctx, v_all,
+                n_new*hparams.n_embd_v_gqa(il),
+                ggml_row_size(v_all->type, hparams.n_embd_v_gqa(il))*head);
+    }
+
+    return ggml_cpy(ctx, v_cur, v_dst);
+}
+
+// Removes `seq_id` (or every sequence when seq_id < 0) from all cells that are SWA-masked
+// relative to position `pmax`. Cells that end up without any sequence are marked free.
+// Additionally counts the cells that are still visible from `pmin` and warns when there are
+// fewer of them than min(n_swa, pmin).
+void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) {
+    // pruning is only meaningful for a cache that applies a sliding window
+    GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache");
+
+    // cells (of any sequence) at or before pmin that are not masked relative to pmin
+    int n_attended = 0;
+
+    for (uint32_t idx = 0; idx < size; ++idx) {
+        auto & cell = cells[idx];
+
+        const llama_pos p0 = cell.pos;
+
+        if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
+            n_attended++;
+        }
+
+        // still inside the window of pmax -> keep the cell as it is
+        if (!is_masked_swa(p0, pmax)) {
+            continue;
+        }
+
+        if (seq_id < 0) {
+            cell.seq_id.clear();
+        } else {
+            if (!cell.has_seq_id(seq_id)) {
+                continue;
+            }
+
+            cell.seq_id.erase(seq_id);
+        }
+
+        if (!cell.is_empty()) {
+            continue;
+        }
+
+        // the cell no longer belongs to any sequence: update the used counter and free it
+        if (cell.pos >= 0) {
+            used--;
+        }
+
+        cell.pos = -1;
+    }
+
+    if (n_attended < std::min<int>(n_swa, pmin)) {
+        LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa);
+    }
+}
+
+// Fills the host-side KQ mask tensor `dst` for the given ubatch.
+// Each row corresponds to one token of the ubatch, each column to one of the first `n` cache cells:
+//   -INFINITY       -> the cell is masked for that token
+//   -|p0 - p1|      -> the cell is visible and hparams.use_alibi is set
+//   0               -> the cell is visible otherwise
+// Rows between n_tokens and GGML_PAD(n_tokens, GGML_KQ_MASK_PAD) are filled with -INFINITY.
+void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+    const int64_t n_tokens     = ubatch->n_tokens;
+    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+    const int64_t n_seqs       = ubatch->n_seqs;
+
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    float * mask = (float *) dst->data;
+
+    const int64_t n_kv = n;
+
+    // Use only the previous KV cells of the correct sequence for each token of the ubatch.
+    // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+    // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
+    //   Causal mask:
+    //      xxx-------
+    //      xxxx------
+    //      xxxxx-----
+    //   Non-causal mask:
+    //      xxxxx-----
+    //      xxxxx-----
+    //      xxxxx-----
+    // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
+    for (int h = 0; h < 1; ++h) {
+        const int64_t head_off = h*(n_kv*n_tokens);
+
+        for (int s = 0; s < n_seqs; ++s) {
+            // only the first sequence id of the token is consulted
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            for (int j = 0; j < n_seq_tokens; ++j) {
+                const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
+
+                const int64_t row_off = head_off + s*(n_kv*n_seq_tokens) + j*n_kv;
+
+                for (int i = 0; i < n_kv; ++i) {
+                    const llama_pos p0 = cells[i].pos;
+
+                    // a cell is hidden when it belongs to another sequence, lies in the future
+                    // (causal attention only), or falls outside of the sliding window
+                    const bool masked =
+                        !cells[i].has_seq_id(seq_id) ||
+                        (causal_attn && p0 > p1)     ||
+                        is_masked_swa(p0, p1);
+
+                    float f = 0.0f;
+
+                    if (masked) {
+                        f = -INFINITY;
+                    } else if (hparams.use_alibi) {
+                        f = -std::abs(p0 - p1);
+                    }
+
+                    mask[row_off + i] = f;
+                }
+            }
+        }
+
+        // mask padded tokens
+        if (mask) {
+            for (int row = n_tokens; row < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++row) {
+                for (int col = 0; col < n_kv; ++col) {
+                    mask[head_off + row*n_kv + col] = -INFINITY;
+                }
+            }
+        }
+    }
+}
+
+// Writes the `delta` of every cache cell into the host-side int32 tensor `dst`, one entry per cell.
+void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+
+    int32_t * shifts = (int32_t *) dst->data;
+
+    for (uint32_t idx = 0; idx != size; ++idx) {
+        shifts[idx] = cells[idx].delta;
+    }
+}
+
+// Fills the host-side int32 tensor `dst` with the relative position bucket of every
+// (ubatch token, cache cell) pair: one row per token, one column per each of the first `n` cells.
+void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+    const int64_t n_tokens = ubatch->n_tokens;
+
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+
+    int32_t * buckets = (int32_t *) dst->data;
+
+    const int64_t n_kv = n;
+
+    for (int h = 0; h < 1; ++h) {
+        for (int tok = 0; tok < n_tokens; ++tok) {
+            const int64_t row_off = h*(n_kv*n_tokens) + tok*n_kv;
+
+            for (int cell = 0; cell < n_kv; ++cell) {
+                buckets[row_off + cell] = llama_relative_position_bucket(cells[cell].pos, ubatch->pos[tok], hparams.n_rel_attn_bkts, false);
+            }
+        }
+    }
}
llama_pos llama_kv_cache_unified::get_pos_max() const {
llama_pos pos_max = -1;
+
for (const auto & cell : cells) {
pos_max = std::max(pos_max, cell.pos);
}
size_t llama_kv_cache_unified::total_size() const {
size_t size = 0;
+
for (const auto & buf : bufs) {
size += ggml_backend_buffer_get_size(buf.get());
}
size_t llama_kv_cache_unified::size_k_bytes() const {
size_t size_k_bytes = 0;
- for (const auto & k : k_l) {
- size_k_bytes += ggml_nbytes(k);
+ for (const auto & layer : layers) {
+ size_k_bytes += ggml_nbytes(layer.k);
}
return size_k_bytes;
size_t llama_kv_cache_unified::size_v_bytes() const {
size_t size_v_bytes = 0;
- for (const auto & v : v_l) {
- size_v_bytes += ggml_nbytes(v);
+ for (const auto & layer : layers) {
+ size_v_bytes += ggml_nbytes(layer.v);
}
return size_v_bytes;
GGML_UNUSED(ubatch);
if (k_shift) {
- assert(ggml_backend_buffer_is_host(k_shift->buffer));
-
- int32_t * data = (int32_t *) k_shift->data;
-
- for (uint32_t i = 0; i < kv_self->size; ++i) {
- data[i] = kv_self->cells[i].delta;
- }
+ kv_self->set_input_k_shift(k_shift);
}
}
ggml_cgraph * gf) const {
auto res = std::make_unique<llm_graph_result>();
- const auto & n_layer = hparams.n_layer;
-
const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
- const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
-
//GGML_ASSERT(kv_self->size == n_ctx);
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
ggml_set_input(inp->k_shift);
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
- const bool is_swa = hparams.is_swa(il);
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
- // note: the swa rope params could become part of the cparams in the future
- // if we decide to make them configurable, like the non-sliding ones
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
-
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
ggml_tensor * k =
- ggml_view_3d(ctx, k_l[il],
+ ggml_view_3d(ctx, layer.k,
n_embd_head_k, n_head_kv, size,
- ggml_row_size(k_l[il]->type, n_embd_head_k),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_head_k),
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
0);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
nm++;
}
- for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
- ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
+ ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
n_embd_k_gqa, nm,
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_k_gqa*i));
- ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
n_embd_k_gqa, nm,
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_k_gqa*id));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
- view_v_src = ggml_view_2d(ctx, v_l[il],
+ view_v_src = ggml_view_2d(ctx, layer.v,
n_embd_v_gqa, nm,
- ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(layer.v->type, n_embd_v_gqa),
+ ggml_row_size(layer.v->type, n_embd_v_gqa*i));
- view_v_dst = ggml_view_2d(ctx, v_l[il],
+ view_v_dst = ggml_view_2d(ctx, layer.v,
n_embd_v_gqa, nm,
- ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(layer.v->type, n_embd_v_gqa),
+ ggml_row_size(layer.v->type, n_embd_v_gqa*id));
} else {
- view_v_src = ggml_view_2d(ctx, v_l[il],
+ view_v_src = ggml_view_2d(ctx, layer.v,
nm, n_embd_v_gqa,
- ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(layer.v->type, size),
+ ggml_row_size(layer.v->type, i));
- view_v_dst = ggml_view_2d(ctx, v_l[il],
+ view_v_dst = ggml_view_2d(ctx, layer.v,
nm, n_embd_v_gqa,
- ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(layer.v->type, size),
+ ggml_row_size(layer.v->type, id));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
}
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_layer = layers.size();
const uint32_t n_kv = cell_max();
const uint32_t n_used = used;
return 0;
}
+// Tells whether a cell at position `p0` is hidden from a token at position `p1`
+// by the sliding-window rule of this cache:
+//   - negative p0                -> always masked
+//   - LLAMA_SWA_TYPE_STANDARD    -> masked when p1 - p0 >= n_swa
+//   - LLAMA_SWA_TYPE_CHUNKED     -> masked when p0 lies before the start of the n_swa-sized chunk that contains p1
+//   - anything else (incl. NONE) -> not masked
+bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+    if (p0 < 0) {
+        return true;
+    }
+
+    if (swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return p1 - p0 >= (int32_t) n_swa;
+    }
+
+    if (swa_type == LLAMA_SWA_TYPE_CHUNKED) {
+        const llama_pos chunk_begin = (p1 / n_swa) * n_swa;
+
+        return p0 < chunk_begin;
+    }
+
+    return false;
+}
+
void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
uint32_t cell_count = 0;
void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
const uint32_t v_trans = this->v_trans ? 1 : 0;
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_layer = layers.size();
io.write(&v_trans, sizeof(v_trans));
io.write(&n_layer, sizeof(n_layer));
// Iterate and write all the keys first, each row is a cell
// Get whole range at a time
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
// Write key type
- const int32_t k_type_i = (int32_t)k_l[il]->type;
+ const int32_t k_type_i = (int32_t)layer.k->type;
io.write(&k_type_i, sizeof(k_type_i));
// Write row size of key
- const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+ const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
io.write(&k_size_row, sizeof(k_size_row));
// Read each range of cells of k_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * k_size_row;
- io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
+ io.write_tensor(layer.k, range.first * k_size_row, buf_size);
}
}
if (!v_trans) {
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Write value type
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
io.write(&v_type_i, sizeof(v_type_i));
// Write row size of value
- const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+ const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
io.write(&v_size_row, sizeof(v_size_row));
// Read each range of cells of v_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * v_size_row;
- io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
+ io.write_tensor(layer.v, range.first * v_size_row, buf_size);
}
}
} else {
// When v is transposed, we also need the element size and get the element ranges from each row
const uint32_t kv_size = size;
- for (uint32_t il = 0; il < n_layer; ++il) {
+
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Write value type
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
io.write(&v_type_i, sizeof(v_type_i));
// Write element size
- const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
+ const uint32_t v_size_el = ggml_type_size(layer.v->type);
io.write(&v_size_el, sizeof(v_size_el));
// Write GQA embedding size
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
const size_t buf_size = range_size * v_size_el;
- io.write_tensor(v_l[il], src_offset, buf_size);
+ io.write_tensor(layer.v, src_offset, buf_size);
}
}
}
llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
batch.n_tokens = cell_count;
- batch.n_seq_tokens = cell_count;
- batch.n_seqs = 1;
for (uint32_t i = 0; i < cell_count; ++i) {
llama_pos pos;
}
batch.pos[i] = pos;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id[i] = &dest_seq_id;
}
- batch.n_seq_id[0] = 1;
- batch.seq_id[0] = &dest_seq_id;
+
if (!find_slot(batch)) {
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
return false;
}
+
commit();
// DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
uint32_t v_trans;
uint32_t n_layer;
+
io.read_to(&v_trans, sizeof(v_trans));
io.read_to(&n_layer, sizeof(n_layer));
- if (n_layer != hparams.n_layer) {
- LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+ if (n_layer != layers.size()) {
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
return false;
}
if (cell_count > size) {
}
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
// Read type of key
int32_t k_type_i_ref;
io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
- const int32_t k_type_i = (int32_t) k_l[il]->type;
+ const int32_t k_type_i = (int32_t) layer.k->type;
if (k_type_i != k_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
return false;
// Read row size of key
uint64_t k_size_row_ref;
io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
- const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+ const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
if (k_size_row != k_size_row_ref) {
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
return false;
if (cell_count) {
// Read and set the keys for the whole cell range
- ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+ ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
}
}
if (!this->v_trans) {
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Read type of value
int32_t v_type_i_ref;
io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
if (v_type_i != v_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
return false;
// Read row size of value
uint64_t v_size_row_ref;
io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
- const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+ const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
if (v_size_row != v_size_row_ref) {
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
return false;
if (cell_count) {
// Read and set the values for the whole cell range
- ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+ ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
}
}
} else {
// For each layer, read the values for each cell (transposed)
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Read type of value
int32_t v_type_i_ref;
io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
if (v_type_i != v_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
return false;
// Read element size of value
uint32_t v_size_el_ref;
io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
- const size_t v_size_el = ggml_type_size(v_l[il]->type);
+ const size_t v_size_el = ggml_type_size(layer.v->type);
if (v_size_el != v_size_el_ref) {
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
return false;
// For each row in the transposed matrix, read the values for the whole cell range
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const size_t dst_offset = (head + j * size) * v_size_el;
- ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
}
}
}
return true;
}
+//
+// llama_kv_cache_unified_iswa
+//
+
+// Creates two unified caches: `kv_base` for the layers that are not SWA layers and `kv_swa`
+// for the SWA layers. The SWA cache is sized to min(kv_size, pad(n_swa*n_seq_max + n_batch)),
+// unless `swa_full` is set, in which case it gets the same size as the base cache and
+// pruning is turned off.
+llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
+        const llama_model & model,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                     bool   offload,
+                 uint32_t   kv_size,
+                     bool   swa_full,
+                 uint32_t   n_seq_max,
+                 uint32_t   n_batch,
+                 uint32_t   padding) : hparams(model.hparams) {
+    // layer selectors handed over to the two underlying caches
+    llama_kv_cache_unified::layer_filter_cb only_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
+    llama_kv_cache_unified::layer_filter_cb only_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
+
+    const uint32_t n_cells_base = kv_size;
+
+    uint32_t n_cells_swa = std::min(n_cells_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+
+    if (swa_full) {
+        // full-size SWA cache: same number of cells as the base cache and no pruning
+        LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+        n_cells_swa = n_cells_base;
+        do_prune    = false;
+    }
+
+    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, n_cells_base);
+
+    kv_base = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(only_base), type_k, type_v,
+            v_trans, offload, n_cells_base, padding,
+            0, LLAMA_SWA_TYPE_NONE);
+
+    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, n_cells_swa);
+
+    kv_swa = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(only_swa), type_k, type_v,
+            v_trans, offload, n_cells_swa, padding,
+            hparams.n_swa, hparams.swa_type);
+}
+
+// Clears both underlying caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::clear() {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.clear();
+    swa.clear();
+}
+
+// Forwards seq_rm to both caches. Both calls are always made; the result is true only
+// when both of them report success.
+bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    const bool ok_base = kv_base->seq_rm(seq_id, p0, p1);
+    const bool ok_swa  = kv_swa ->seq_rm(seq_id, p0, p1);
+
+    return ok_base && ok_swa;
+}
+
+// Forwards seq_cp to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    swa.seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+// Forwards seq_keep to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.seq_keep(seq_id);
+    swa.seq_keep(seq_id);
+}
+
+// Forwards seq_add to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.seq_add(seq_id, p0, p1, delta);
+    swa.seq_add(seq_id, p0, p1, delta);
+}
+
+// Forwards seq_div to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.seq_div(seq_id, p0, p1, d);
+    swa.seq_div(seq_id, p0, p1, d);
+}
+
+// Smallest position of `seq_id` as reported by the SWA cache.
+llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
+    // only the SWA cache is queried: the base cache is a superset of the SWA cache
+    const llama_pos pos_min = kv_swa->seq_pos_min(seq_id);
+
+    return pos_min;
+}
+
+// Largest position of `seq_id` as reported by the SWA cache.
+llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
+    const llama_pos pos_max = kv_swa->seq_pos_max(seq_id);
+
+    return pos_max;
+}
+
+// Forwards restore to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::restore() {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.restore();
+    swa.restore();
+}
+
+// Commits both caches and then, unless pruning is disabled, prunes the SWA cache for every
+// sequence recorded in `pending.pos`. The pending information is cleared in all cases.
+void llama_kv_cache_unified_iswa::commit() {
+    kv_base->commit();
+    kv_swa ->commit();
+
+    // slide the attention window, forgetting/pruning old tokens that are outside the window
+    if (do_prune) {
+        for (const auto & item : pending.pos) {
+            const auto & range = item.second;
+
+            kv_swa->prune_swa(item.first, range.pmin, range.pmax);
+        }
+    }
+
+    pending.clear();
+}
+
+// Forwards update to both caches. Both calls are always made; the result is true only
+// when both of them return true.
+bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
+    const bool res_base = kv_base->update(lctx);
+    const bool res_swa  = kv_swa ->update(lctx);
+
+    return res_base && res_swa;
+}
+
+// Forwards defrag_sched with the same threshold to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.defrag_sched(thold);
+    swa.defrag_sched(thold);
+}
+
+// Forwards set_full to both caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::set_full() {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.set_full();
+    swa.set_full();
+}
+
+// Resets the pending information and, when pruning is enabled, records for every sequence id
+// in the batch the smallest and the largest position that appears for it. Then wraps the batch
+// in a llama_sbatch.
+llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
+    pending.clear();
+
+    if (do_prune) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            const llama_pos pos = batch.pos[i];
+
+            for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id = batch.seq_id[i][s];
+
+                auto it = pending.pos.find(seq_id);
+
+                if (it == pending.pos.end()) {
+                    // first token of this sequence: the range starts out as [pos, pos]
+                    auto & range = pending.pos[seq_id];
+
+                    range.pmin = pos;
+                    range.pmax = pos;
+                } else {
+                    // widen the already recorded range
+                    auto & range = it->second;
+
+                    range.pmin = std::min(range.pmin, pos);
+                    range.pmax = std::max(range.pmax, pos);
+                }
+            }
+        }
+    }
+
+    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+}
+
+// Returns the next ubatch via sbatch.split_simple(n_ubatch); `embd_pooled` is ignored.
+llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
+    GGML_UNUSED(embd_pooled);
+
+    return sbatch.split_simple(n_ubatch);
+}
+
+// Forwards find_slot to both caches. Both calls are always made; the result is true only
+// when both caches found a slot.
+bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
+    const bool found_base = kv_base->find_slot(batch);
+    const bool found_swa  = kv_swa ->find_slot(batch);
+
+    return found_base && found_swa;
+}
+
+// Token count as reported by the base cache.
+int32_t llama_kv_cache_unified_iswa::get_n_tokens() const {
+    const int32_t n_tokens = kv_base->get_n_tokens();
+
+    return n_tokens;
+}
+
+// Number of used cells as reported by the base cache.
+int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
+    const int32_t n_used = kv_base->get_used_cells();
+
+    return n_used;
+}
+
+// Largest position as reported by the base cache.
+llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
+    const llama_pos pos_max = kv_base->get_pos_max();
+
+    return pos_max;
+}
+
+// True only when the base cache and the SWA cache have the same number of cells.
+bool llama_kv_cache_unified_iswa::get_can_shift() const {
+    const auto n_cells_base = kv_base->get_size();
+    const auto n_cells_swa  = kv_swa ->get_size();
+
+    return n_cells_base == n_cells_swa;
+}
+
+// Writes the state of the base cache followed by the state of the SWA cache to `io`.
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    const auto & base = *kv_base;
+    const auto & swa  = *kv_swa;
+
+    base.state_write(io, seq_id);
+    swa.state_write(io, seq_id);
+}
+
+// Reads the state of the base cache followed by the state of the SWA cache from `io`
+// (the same order in which state_write emits them).
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    auto & base = *kv_base;
+    auto & swa  = *kv_swa;
+
+    base.state_read(io, seq_id);
+    swa.state_read(io, seq_id);
+}
+
+// Non-owning pointer to the cache that holds the non-SWA layers.
+llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
+    llama_kv_cache_unified * base = kv_base.get();
+
+    return base;
+}
+
+// Non-owning pointer to the cache that holds the SWA layers.
+llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
+    llama_kv_cache_unified * swa = kv_swa.get();
+
+    return swa;
+}
+
//
// llama_kv_cache_recurrent
//
}
}
+// Smallest position among the cells that hold `seq_id`; -1 when no cell holds it.
+llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
+    // sentinel value meaning "no matching cell seen so far"
+    const llama_pos none = std::numeric_limits<llama_pos>::max();
+
+    llama_pos pos_min = none;
+
+    for (uint32_t idx = 0; idx < size; ++idx) {
+        if (!cells[idx].has_seq_id(seq_id)) {
+            continue;
+        }
+
+        if (cells[idx].pos < pos_min) {
+            pos_min = cells[idx].pos;
+        }
+    }
+
+    return pos_min == none ? -1 : pos_min;
+}
+
llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
- llama_pos result = 0;
+ llama_pos result = -1;
for (uint32_t i = 0; i < size; ++i) {
if (cells[i].has_seq_id(seq_id)) {
io.read_to(&cell_count, sizeof(cell_count));
bool res = true;
+
res = res && state_read_meta(io, cell_count, seq_id);
res = res && state_read_data(io, cell_count);
}
}
-void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
- // TODO: rework this in the future, for now quick hack
- const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
- if (kvu == nullptr) {
- LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
- return;
- }
-
- if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
- view->n_cells = int32_t(kvu->size);
- void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
- GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
- view->cells = (llama_kv_cache_view_cell *)p;
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
- GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
- view->cells_sequences = (llama_seq_id *)p;
- }
-
- const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
- llama_kv_cache_view_cell * c_curr = view->cells;
- llama_seq_id * cs_curr = view->cells_sequences;
- int32_t used_cells = 0;
- int32_t token_count = 0;
- int32_t curr_contig_idx = -1;
- uint32_t max_contig = 0;
- int32_t max_contig_idx = -1;
-
- for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
- const size_t curr_size = kv_cells[i].seq_id.size();
- token_count += curr_size;
- c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
-
- if (curr_size > 0) {
- if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
- max_contig = i - curr_contig_idx;
- max_contig_idx = curr_contig_idx;
- }
- curr_contig_idx = -1;
- } else if (curr_contig_idx < 0) {
- curr_contig_idx = i;
- }
-
- int seq_idx = 0;
- for (const llama_seq_id it : kv_cells[i].seq_id) {
- if (seq_idx >= view->n_seq_max) {
- break;
- }
- cs_curr[seq_idx] = it;
- seq_idx++;
- }
- if (seq_idx != 0) {
- used_cells++;
- }
- for (; seq_idx < view->n_seq_max; seq_idx++) {
- cs_curr[seq_idx] = -1;
- }
- }
- if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
- max_contig_idx = curr_contig_idx;
- max_contig = kv_cells.size() - curr_contig_idx;
- }
- view->max_contiguous = max_contig;
- view->max_contiguous_idx = max_contig_idx;
- view->token_count = token_count;
- view->used_cells = used_cells;
- if (uint32_t(used_cells) != kvu->used) {
- LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
- __func__, kvu->used, used_cells);
- }
+void llama_kv_cache_view_update(llama_kv_cache_view * , const llama_kv_cache * ) {
+ // TODO: intentionally a no-op stub (both arguments are ignored); will be removed soon, keep this for now to avoid too many changes in
+ // https://github.com/ggml-org/llama.cpp/pull/13194
}
#include "ggml-cpp.h"
#include <set>
+#include <unordered_map>
#include <vector>
struct llama_cparams;
// batch processing
//
+ // =============================================================================================================
+ // TODO: refactor and simplify this
+
virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
// different KV caches require different batch splitting strategies
// find an empty slot of size "n_tokens" in the cache
virtual bool find_slot(const llama_ubatch & batch) = 0;
+ // =============================================================================================================
+
// getters
virtual int32_t get_n_tokens() const = 0;
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
// llama_kv_cache_unified
//
-// TODO: add notion of max sequences
class llama_kv_cache_unified : public llama_kv_cache {
public:
- struct kv_cell {
- llama_pos pos = -1;
- llama_pos delta = 0;
-
- std::set<llama_seq_id> seq_id;
-
- bool has_seq_id(const llama_seq_id & id) const {
- return seq_id.find(id) != seq_id.end();
- }
-
- bool is_empty() const {
- return seq_id.empty();
- }
-
- bool is_same_seq(const kv_cell & other) const {
- return seq_id == other.seq_id;
- }
- };
-
static uint32_t get_padding(const llama_cparams & cparams);
+ // this callback is used to filter out layers that should not be included in the cache
+ using layer_filter_cb = std::function<bool(int32_t il)>;
+
llama_kv_cache_unified(
- const llama_model & model,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- uint32_t kv_size,
- uint32_t padding);
+ const llama_model & model,
+ layer_filter_cb && filter,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ uint32_t kv_size,
+ uint32_t padding,
+ uint32_t n_swa,
+ llama_swa_type swa_type);
~llama_kv_cache_unified() = default;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
- void seq_keep(llama_seq_id seq_id) override;
+ void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
// updates the cache head
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
- void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
- uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
- uint32_t size = 0; // total number of cells, shared across all sequences
- uint32_t used = 0; // used cells (i.e. at least one seq_id)
+ //
+ // llama_kv_cache_unified specific API
+ //
- // computed before each graph build
- uint32_t n = 0;
+ uint32_t get_n() const;
+ uint32_t get_size() const;
- std::vector<kv_cell> cells;
+ // get views of the current state of the cache
+ ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+ ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
- std::vector<ggml_tensor *> k_l; // per layer
- std::vector<ggml_tensor *> v_l;
+ // store k_cur and v_cur in the cache based on the current head location
+ ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
+ ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
+
+ void prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax);
+
+ void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+ void set_input_k_shift (ggml_tensor * dst) const;
+ void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
private:
const llama_model & model;
const llama_hparams & hparams;
+ struct kv_cell { // per-cell metadata; one element of the `cells` vector
+ llama_pos pos = -1; // position stored for this cell, -1 by default
+ llama_pos delta = 0; // 0 by default; presumably the accumulated position shift of this cell - TODO confirm
+
+ // TODO: replace with bitset uint64_t
+ std::set<llama_seq_id> seq_id; // sequence ids attached to this cell (a cell with none is empty, see is_empty())
+
+ bool has_seq_id(const llama_seq_id & id) const { // true when `id` is one of the ids stored in seq_id
+ return seq_id.find(id) != seq_id.end();
+ }
+
+ bool is_empty() const { // true when no sequence id is stored in this cell
+ return seq_id.empty();
+ }
+
+ bool is_same_seq(const kv_cell & other) const { // true when both cells store exactly the same set of sequence ids
+ return seq_id == other.seq_id;
+ }
+ };
+
+ struct kv_layer { // pairs a model layer index with the two tensors kept for that layer
+ // layer index in the model
+ // note: can be different from the layer index in the KV cache
+ uint32_t il;
+
+ ggml_tensor * k; // presumably the K tensor of this layer - TODO confirm
+ ggml_tensor * v; // presumably the V tensor of this layer - TODO confirm
+ };
+
bool has_shift = false;
bool do_defrag = false;
-
bool v_trans = true; // the value tensor is transposed
- bool can_shift = false;
+
+ uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+ uint32_t size = 0; // total number of cells, shared across all sequences
+ uint32_t used = 0; // used cells (i.e. at least one seq_id) (TODO: add `struct kv_cells` and keep track automatically)
+
+ // computed before each graph build
+ uint32_t n = 0;
// required padding
uint32_t padding = 1;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
+ // SWA
+ uint32_t n_swa = 0;
+
+ llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
+ std::vector<kv_cell> cells; // TODO: replace with `struct kv_cells`
+ std::vector<kv_layer> layers;
+
+ // model layer id -> KV cache layer id
+ std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+ // recovery information used to restore the KV cells to their original state in case of a failure
+ struct {
+ void clear() { // drop every saved cell
+ cells.clear();
+ }
+
+ std::unordered_map<uint32_t, kv_cell> cells; // saved kv_cell copies; the key is presumably the cell index - TODO confirm
+ } recovery;
+
// defrag
struct {
std::vector<uint32_t> ids;
// return true if cells have been moved
bool defrag_prepare(int32_t n_max_nodes);
- // commit/restore cache
- struct slot_range {
- uint32_t c0 = 0; // note: these are cell indices, not sequence positions
- uint32_t c1 = 0;
- };
-
- // pending cell updates that are not yet committed
- struct {
- std::vector<slot_range> ranges;
- } pending;
-
// find how many cells are currently in use
uint32_t cell_max() const;
size_t size_k_bytes() const;
size_t size_v_bytes() const;
+ bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
ggml_tensor * build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
+//
+// llama_kv_cache_unified_iswa
+//
+
+// utilizes two instances of llama_kv_cache_unified
+// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
+// upon successful commit, the SWA cache removes old tokens outside the n_swa window
+
+class llama_kv_cache_unified_iswa : public llama_kv_cache { // KV cache that owns two llama_kv_cache_unified instances (kv_base and kv_swa)
+public:
+ llama_kv_cache_unified_iswa(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ uint32_t kv_size,
+ bool swa_full, // use full-size SWA cache
+ uint32_t n_seq_max,
+ uint32_t n_batch,
+ uint32_t padding);
+
+ ~llama_kv_cache_unified_iswa() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ void clear() override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ //
+ // llama_kv_cache
+ //
+
+ void restore() override;
+ void commit() override;
+
+ bool update(llama_context & ctx) override;
+
+ void defrag_sched(float thold) override;
+
+ void set_full() override;
+
+ llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+ llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+ bool find_slot(const llama_ubatch & batch) override;
+
+ int32_t get_n_tokens() const override;
+ int32_t get_used_cells() const override;
+
+ // TODO: better data structures to reduce the cost of this operation
+ llama_pos get_pos_max() const override;
+
+ bool get_can_shift() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+
+ //
+ // llama_kv_cache_unified_iswa specific API
+ //
+
+ llama_kv_cache_unified * get_kv_base() const; // presumably the cache for the non-SWA layers - TODO confirm against the definition
+ llama_kv_cache_unified * get_kv_swa () const; // presumably the cache for the SWA layers - TODO confirm against the definition
+
+private:
+ const llama_hparams & hparams;
+
+ bool do_prune = true; // NOTE(review): presumably switched off when swa_full is requested - confirm in the constructor
+
+ struct {
+ struct entry { // a pair of positions recorded for one sequence
+ llama_pos pmin;
+ llama_pos pmax;
+ };
+
+ void clear() { // forget all recorded entries
+ pos.clear();
+ }
+
+ // used to perform SWA pruning of old tokens
+ std::unordered_map<llama_seq_id, entry> pos;
+ } pending;
+
+ std::unique_ptr<llama_kv_cache_unified> kv_base; // owned; the class comment assigns the non-SWA layers to the first instance
+ std::unique_ptr<llama_kv_cache_unified> kv_swa; // owned; the class comment assigns the SWA layers to the second instance
+};
+
//
// llama_kv_cache_recurrent
//
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
bool find_slot(const llama_ubatch & batch) override;
ggml_type type_k;
ggml_type type_v;
- // parameters for other types of memory
- // ...
+ // use full-size SWA cache
+ bool swa_full;
};
// general concept of LLM memory
virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
+ virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
virtual bool get_can_edit() const = 0;
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
- hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
switch (hparams.n_expert) {
case 16: type = LLM_TYPE_17B_16E; break;
// for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
// default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+ LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
hparams.n_swa = 2047;
} else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
// default value for Phi-3-mini-128k-instruct
- // note: this seems incorrect because the window is bigger than the train context?
- hparams.n_swa = 262144;
+ LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = hparams.n_ctx_train;
+ hparams.n_swa_pattern = 1;
} else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
// default value for Phi-3-medium-128k-instruct
- // note: this seems incorrect because the window is equal to the train context?
- hparams.n_swa = 131072;
+ LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = hparams.n_ctx_train;
+ hparams.n_swa_pattern = 1;
}
+
bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (!found_swa && hparams.n_swa == 0) {
throw std::runtime_error("invalid value for sliding_window");
}
+
+ // n_swa exceeds the training context: disable SWA and clamp the window to n_ctx_train
+ if (hparams.n_swa > hparams.n_ctx_train) {
+ // the message prints ">" so that it matches the condition above (n_swa == n_ctx_train does not reach this branch)
+ // "%u" is used on the assumption that both fields are uint32_t in llama_hparams - TODO confirm
+ LLAMA_LOG_WARN("%s: unexpected n_swa: %u > %u, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = hparams.n_ctx_train;
+ hparams.n_swa_pattern = 1;
+ }
} break;
case LLM_ARCH_PHIMOE:
{
} break;
case LLM_ARCH_GEMMA2:
{
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096; // default value of gemma 2
hparams.n_swa_pattern = 2;
hparams.attn_soft_cap = true;
} break;
case LLM_ARCH_GEMMA3:
{
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa_pattern = 6;
hparams.rope_freq_base_train_swa = 10000.0f;
} break;
case LLM_ARCH_COHERE2:
{
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa_pattern = 4;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
return it->second;
}
-ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+ // SWA layers take the SWA training base frequency from the hparams
+ if (hparams.is_swa(il)) {
+ return hparams.rope_freq_base_train_swa;
+ }
+
+ // every other layer takes the base frequency configured on the context
+ return cparams.rope_freq_base;
+}
+
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+ // SWA layers take the SWA training frequency scale from the hparams
+ if (hparams.is_swa(il)) {
+ return hparams.rope_freq_scale_train_swa;
+ }
+
+ // every other layer takes the frequency scale configured on the context
+ return cparams.rope_freq_scale;
+}
+
+ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+ const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
// choose long/short freq factors based on the context size
if (layers[il].rope_freqs != nullptr) {
return layers[il].rope_freqs;
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
// temperature tuning
ggml_tensor * inp_attn_scale = nullptr;
- if (arch == LLM_ARCH_LLAMA4) {
- inp_attn_scale = build_inp_attn_scale();
- }
+ inp_attn_scale = build_inp_attn_scale();
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
- bool use_rope = arch == LLM_ARCH_LLAMA4
- ? (il + 1) % hparams.n_no_rope_layer_step != 0
- : true;
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
// norm
cur = build_norm(inpL,
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+ if (use_rope && hparams.use_kq_norm) {
// Llama4TextL2Norm
Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
- // feed-forward network (non-MoE)
- if (model.layers[il].ffn_gate_inp == nullptr) {
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- } else if (arch == LLM_ARCH_LLAMA4) {
+ {
// llama4 MoE
ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
cur = ggml_add(ctx0, moe_out, shexp_out);
cb(cur, "ffn_moe_out_merged", il);
-
- } else {
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
} else if (n_head > 0) {
// self-attention
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
}
};
-struct llm_build_phi3 : public llm_graph_context {
- llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_phi3_iswa : public llm_graph_context {
+ llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
for (int il = 0; il < n_layer; ++il) {
auto * residual = inpL;
// self-attention
{
// rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
ggml_tensor* attn_norm_output = build_norm(inpL,
model.layers[il].attn_norm,
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// norm
cur = build_norm(inpL,
}
};
-struct llm_build_gemma2 : public llm_graph_context {
- llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_gemma2_iswa : public llm_graph_context {
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
ggml_tensor * cur;
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
for (int il = 0; il < n_layer; ++il) {
// norm
}
};
-struct llm_build_gemma3 : public llm_graph_context {
- llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_gemma3_iswa : public llm_graph_context {
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
ggml_tensor * cur;
ggml_tensor * inp_pos = build_inp_pos();
// TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
for (int il = 0; il < n_layer; ++il) {
- const bool is_swa = hparams.is_swa(il);
-
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
}
};
-struct llm_build_cohere2 : public llm_graph_context {
- llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_cohere2_iswa : public llm_graph_context {
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il);
// self-attention
{
// rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
if (use_rope) {
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
- res = new llama_kv_cache_unified(
- *this,
- params.type_k,
- params.type_v,
- !cparams.flash_attn,
- cparams.offload_kqv,
- cparams.n_ctx,
- padding);
+ if (hparams.n_swa > 0) { // the model declares a sliding window: build the cache that wraps two unified caches
+ res = new llama_kv_cache_unified_iswa(
+ *this,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn, // passed as v_trans
+ cparams.offload_kqv,
+ cparams.n_ctx, // passed as kv_size
+ params.swa_full,
+ cparams.n_seq_max,
+ cparams.n_batch,
+ padding);
+ } else { // no sliding window declared: build a single unified cache
+ res = new llama_kv_cache_unified(
+ *this,
+ nullptr, // no layer filter callback; presumably means "keep every layer" - TODO confirm in the constructor
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn, // passed as v_trans
+ cparams.offload_kqv,
+ cparams.n_ctx, // passed as kv_size
+ padding,
+ hparams.n_swa,
+ hparams.swa_type);
+ }
}
}
switch (arch) {
case LLM_ARCH_LLAMA:
- case LLM_ARCH_LLAMA4:
case LLM_ARCH_MINICPM:
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
+ case LLM_ARCH_LLAMA4:
+ {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+ } break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
case LLM_ARCH_PHI3:
case LLM_ARCH_PHIMOE:
{
- llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+ llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
} break;
case LLM_ARCH_PLAMO:
{
} break;
case LLM_ARCH_GEMMA2:
{
- llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
} break;
case LLM_ARCH_GEMMA3:
{
- llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
} break;
case LLM_ARCH_STARCODER2:
{
} break;
case LLM_ARCH_COHERE2:
{
- llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
} break;
case LLM_ARCH_DBRX:
{
const struct ggml_tensor * get_tensor(const char * name) const;
- ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+ float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+ float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+ ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
// note: can mutate `cparams`
// TODO: move this to new llm_arch_model_i interface
cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings;
cparams.op_offload = !no_op_offload;
+ cparams.swa_full = false;
return cparams;
}
}
}
+ if (!llama_kv_self_can_shift(ctx)) { // the context reports that it cannot shift: adjust or reject the options below
+ if (params_base.ctx_shift) { // context shift was requested: turn it off and warn
+ params_base.ctx_shift = false;
+ SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
+ }
+
+ if (params_base.n_cache_reuse) { // cache reuse was requested: reset it to 0 and warn
+ params_base.n_cache_reuse = 0;
+ SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
+ }
+
+ if (!params_base.speculative.model.path.empty()) { // a speculative model path is set: unlike the two cases above this is a hard failure
+ SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+ return false;
+ }
+ }
+
return true;
}
// if we don't cache the prompt, we have to remove the entire KV cache
llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
slot.n_past = 0;
- slot.cache_tokens.clear();
+ slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+ }
+
+ if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) { // n_past lies strictly between 0 and the number of cached tokens
+ if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) { // the smallest position reported for this slot's sequence is above 0
+ SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+ "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+ slot.n_past = 0; // restart prompt processing from the first token
+ }
}
}