auto generate = [&](const std::string & prompt) {
std::string response;
- const bool is_first = llama_kv_self_used_cells(ctx) == 0;
+ const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == -1;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
- int n_ctx_used = llama_kv_self_used_cells(ctx);
+ int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0) + 1;
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
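For reference, a minimal sketch of the context-usage check after this change, assuming llama_kv_self_seq_pos_max() returns -1 for an empty sequence (as the compatibility wrappers further down imply):

// sketch only: how many positions sequence 0 currently occupies
static bool fits_in_context(llama_context * ctx, const llama_batch & batch) {
    const int n_ctx      = llama_n_ctx(ctx);
    // seq_pos_max() is 0-based and returns -1 when the sequence is empty, hence the +1
    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0) + 1;
    return n_ctx_used + batch.n_tokens <= n_ctx;
}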
// Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+ "Use llama_kv_self_seq_pos_max() instead");
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
- LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+ "Use llama_kv_self_seq_pos_max() instead");
// Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_self_clear(
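A hedged migration sketch for callers of the deprecated getters, mirroring the per-sequence loop of the compatibility wrappers below; it assumes llama_n_seq_max() and the public llama_kv_self_seq_pos_min()/llama_kv_self_seq_pos_max() pair are available:

// approximate token count across all sequences, replacing llama_kv_self_n_tokens()
static int32_t kv_tokens_approx(llama_context * ctx) {
    int32_t res = 0;
    for (uint32_t s = 0; s < llama_n_seq_max(ctx); ++s) {
        const llama_pos p0 = llama_kv_self_seq_pos_min(ctx, s);
        const llama_pos p1 = llama_kv_self_seq_pos_max(ctx, s);
        if (p0 >= 0) {
            res += (p1 - p0) + 1; // sequence s occupies positions p0..p1 inclusive
        }
    }
    return res;
}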
#include "llama-batch.h"
+#include <cassert>
#include <cstring>
#include <algorithm>
batch = in_batch;
GGML_ASSERT(batch.n_tokens > 0);
if (!batch.pos) {
+ assert(p0 >= 0);
pos.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; i++) {
- pos[i] = i + p0;
+ pos[i] = p0 + i;
}
batch.pos = pos.data();
}
return -1;
}
+ if (!inp_batch.pos) {
+ if (inp_batch.seq_id) {
+ LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
+ return -1;
+ }
+ }
+
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
// temporary allocate memory for the input batch if needed
- // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
const llama_batch & batch = batch_allocr.batch;
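The new check above rejects batches that provide seq_id but leave pos implicit, since the implicit positions (derived from seq_pos_max(0) + 1) are only meaningful for the single default sequence. A minimal sketch of a batch that passes the check, built with llama_batch_init(); the tokens array and the 4-token size are illustrative only:

// batch with explicit positions and sequence ids - both set, as required above
llama_batch batch = llama_batch_init(/*n_tokens_alloc =*/ 4, /*embd =*/ 0, /*n_seq_max =*/ 1);
for (int i = 0; i < 4; ++i) {
    batch.token[i]     = tokens[i];   // 'tokens' is assumed to hold 4 tokenized ids
    batch.pos[i]       = i;           // explicit positions
    batch.n_seq_id[i]  = 1;
    batch.seq_id[i][0] = 0;           // sequence 0
    batch.logits[i]    = (i == 3);    // request logits for the last token only
}
batch.n_tokens = 4;
// ... llama_decode(ctx, batch); llama_batch_free(batch);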
// kv cache
//
+// deprecated
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
return 0;
}
- return kv->get_n_tokens();
+ int32_t res = 0;
+
+ for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+ const llama_pos p0 = kv->seq_pos_min(s);
+ const llama_pos p1 = kv->seq_pos_max(s);
+
+ if (p0 >= 0) {
+ res += (p1 - p0) + 1;
+ }
+ }
+
+ return res;
}
+// deprecated
+// note: intentionally identical to llama_kv_self_n_tokens() - both are deprecated and will be removed
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
return 0;
}
- return kv->get_used_cells();
+ int32_t res = 0;
+
+ for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+ const llama_pos p0 = kv->seq_pos_min(s);
+ const llama_pos p1 = kv->seq_pos_max(s);
+
+ if (p0 >= 0) {
+ res += (p1 - p0) + 1;
+ }
+ }
+
+ return res;
}
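For illustration (not part of the patch): if sequence 0 spans positions 0..9 and sequence 1 spans positions 5..9, both wrappers now report (9 - 0 + 1) + (9 - 5 + 1) = 15, i.e. cells shared by several sequences are still counted once per sequence, which matches the old doc comment on llama_kv_self_n_tokens().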
void llama_kv_self_clear(llama_context * ctx) {
bool v_trans,
bool offload,
uint32_t kv_size,
- uint32_t padding,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
uint32_t n_swa,
- llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
- GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
+ llama_swa_type swa_type) :
+ model(model), hparams(model.hparams), v_trans(v_trans),
+ n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
- this->type_k = type_k;
- this->type_v = type_v;
+ GGML_ASSERT(kv_size % n_pad == 0);
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
const size_t memory_size_k = size_k_bytes();
const size_t memory_size_v = size_v_bytes();
- LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(),
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
void llama_kv_cache_unified::defrag_sched(float thold) {
// - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
- const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
+ const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;
// queue defragmentation for next llama_kv_cache_update
if (fragmentation > thold) {
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
- n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
+ n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));
#ifdef FIND_SLOT_DEBUG
LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
return true;
}
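For intuition on the computation of n above (a sketch, assuming GGML_PAD(x, n) rounds x up to the next multiple of n): with cell_max() = 1000, n_pad = 256 and size = 4096, GGML_PAD(1000, 256) = 1024, so n = min(4096, max(256, 1024)) = 1024 and attention only needs to cover the first 1024 cells rather than the full cache.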
-int32_t llama_kv_cache_unified::get_n_tokens() const {
- int32_t result = 0;
-
- for (uint32_t i = 0; i < size; i++) {
- result += cells[i].seq_id.size();
- }
-
- return result;
-}
-
-int32_t llama_kv_cache_unified::get_used_cells() const {
- return used;
-}
-
bool llama_kv_cache_unified::get_can_shift() const {
return true;
}
}
}
-llama_pos llama_kv_cache_unified::get_pos_max() const {
- llama_pos pos_max = -1;
-
- for (const auto & cell : cells) {
- pos_max = std::max(pos_max, cell.pos);
- }
-
- return pos_max;
-}
-
size_t llama_kv_cache_unified::total_size() const {
size_t size = 0;
llama_seq_id seq_id;
io.read_to(&seq_id, sizeof(seq_id));
- // TODO: llama_kv_cache_unified should have a notion of max sequences
- //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
- if (seq_id < 0) {
- //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
- LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
return false;
}
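As an illustrative consequence of the tightened check: a session file written with, say, --parallel 4 can contain seq_id values 0..3; restoring it into a context created with n_seq_max = 1 now fails here with a clear error instead of silently accepting out-of-range sequence ids.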
ggml_type type_v,
bool v_trans,
bool offload,
- uint32_t kv_size,
bool swa_full,
+ uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_batch,
- uint32_t padding) : hparams(model.hparams) {
+ uint32_t n_pad) : hparams(model.hparams) {
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
const uint32_t size_base = kv_size;
- uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+ uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
if (swa_full) {
kv_base = std::make_unique<llama_kv_cache_unified>(
model, std::move(filter_base), type_k, type_v,
- v_trans, offload, size_base, padding,
+ v_trans, offload, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache_unified>(
model, std::move(filter_swa), type_k, type_v,
- v_trans, offload, size_swa, padding,
+ v_trans, offload, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type);
}
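A worked example of the SWA sizing above (illustrative numbers): with hparams.n_swa = 4096, n_seq_max = 2, n_batch = 512 and n_pad = 256, size_swa = min(size_base, GGML_PAD(4096*2 + 512, 256)) = min(size_base, 8704); with swa_full the SWA cache instead uses the full size_base and pruning is disabled, as noted in the comment above.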
return res;
}
-int32_t llama_kv_cache_unified_iswa::get_n_tokens() const {
- return kv_base->get_n_tokens();
-}
-
-int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
- return kv_base->get_used_cells();
-}
-
-llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
- return kv_base->get_pos_max();
-}
-
bool llama_kv_cache_unified_iswa::get_can_shift() const {
return kv_base->get_size() == kv_swa->get_size();
}
ggml_type type_k,
ggml_type type_v,
bool offload,
- uint32_t kv_size) : hparams(model.hparams) {
+ uint32_t kv_size,
+ uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer;
- LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
- __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
+ LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
+ __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
head = 0;
size = kv_size;
used = 0;
- this->type_k = type_k;
- this->type_v = type_v;
-
cells.clear();
cells.resize(kv_size);
pending.ranges.clear();
}
-bool llama_kv_cache_recurrent::update(llama_context & lctx) {
- GGML_UNUSED(lctx);
+bool llama_kv_cache_recurrent::update(llama_context & ctx) {
+ GGML_UNUSED(ctx);
return false;
}
if (seq_id < 0 || (uint32_t) seq_id >= size) {
// too big seq_id
// TODO: would it be possible to resize the cache instead?
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u, try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
return false;
}
if (j > 0) {
return n >= n_seqs;
}
-int32_t llama_kv_cache_recurrent::get_n_tokens() const {
- int32_t result = 0;
-
- for (uint32_t i = 0; i < size; i++) {
- result += cells[i].seq_id.size();
- }
-
- return result;
-}
-
-int32_t llama_kv_cache_recurrent::get_used_cells() const {
- return used;
-}
-
-llama_pos llama_kv_cache_recurrent::get_pos_max() const {
- llama_pos pos_max = -1;
- for (const auto & cell : cells) {
- pos_max = std::max(pos_max, cell.pos);
- }
-
- return pos_max;
-}
-
bool llama_kv_cache_recurrent::get_can_shift() const {
return false;
}
// =============================================================================================================
// getters
- virtual int32_t get_n_tokens() const = 0;
- virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
- virtual llama_pos get_pos_max() const = 0;
virtual bool get_can_shift() const = 0;
bool get_can_edit() const override { return get_can_shift(); }
bool v_trans,
bool offload,
uint32_t kv_size,
- uint32_t padding,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type);
// to the first cell of the slot.
bool find_slot(const llama_ubatch & batch) override;
- int32_t get_n_tokens() const override;
- int32_t get_used_cells() const override;
-
- // TODO: better data structures to reduce the cost of this operation
- llama_pos get_pos_max() const override;
-
bool get_can_shift() const override;
// state write/load
// computed before each graph build
uint32_t n = 0;
- // required padding
- uint32_t padding = 1;
+ const uint32_t n_seq_max = 1;
- ggml_type type_k = GGML_TYPE_F16;
- ggml_type type_v = GGML_TYPE_F16;
+ // required padding
+ const uint32_t n_pad = 1;
// SWA
- uint32_t n_swa = 0;
+ const uint32_t n_swa = 0;
- llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+ const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
ggml_type type_v,
bool v_trans,
bool offload,
- uint32_t kv_size,
bool swa_full,
+ uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_batch,
- uint32_t padding);
+ uint32_t n_pad);
~llama_kv_cache_unified_iswa() = default;
bool find_slot(const llama_ubatch & batch) override;
- int32_t get_n_tokens() const override;
- int32_t get_used_cells() const override;
-
- // TODO: better data structures to reduce the cost of this operation
- llama_pos get_pos_max() const override;
-
bool get_can_shift() const override;
// state write/load
ggml_type type_k,
ggml_type type_v,
bool offload,
- uint32_t kv_size);
+ uint32_t kv_size,
+ uint32_t n_seq_max);
~llama_kv_cache_recurrent() = default;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
void restore() override;
void commit() override;
- bool update(llama_context & lctx) override;
+ bool update(llama_context & ctx) override;
void defrag_sched(float thold) override;
bool find_slot(const llama_ubatch & batch) override;
- int32_t get_n_tokens() const override;
- int32_t get_used_cells() const override;
-
- // TODO: better data structures to reduce the cost of this operation
- llama_pos get_pos_max() const override;
-
bool get_can_shift() const override;
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
std::vector<slot_range> ranges;
} pending;
- ggml_type type_k = GGML_TYPE_F16;
- ggml_type type_v = GGML_TYPE_F16;
+ const uint32_t n_seq_max = 1;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
GGML_TYPE_F32,
GGML_TYPE_F32,
cparams.offload_kqv,
- std::max((uint32_t) 1, cparams.n_seq_max));
+ std::max((uint32_t) 1, cparams.n_seq_max),
+ cparams.n_seq_max);
} break;
default:
{
params.type_v,
!cparams.flash_attn,
cparams.offload_kqv,
- cparams.n_ctx,
params.swa_full,
+ cparams.n_ctx,
cparams.n_seq_max,
cparams.n_batch,
padding);
!cparams.flash_attn,
cparams.offload_kqv,
cparams.n_ctx,
+ cparams.n_seq_max,
padding,
hparams.n_swa,
hparams.swa_type);
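Illustrative effect of the call-site changes: with e.g. --ctx-size 8192 and --parallel 4, the unified caches are now constructed with kv_size = 8192 and n_seq_max = 4, while the recurrent branch above sizes its cache to max(1, n_seq_max) = 4 cells (one state per sequence) and also passes n_seq_max explicitly.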
// Function to tokenize the prompt
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
- const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
+ const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == -1;
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
prompt_tokens.resize(n_prompt_tokens);
// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
const int n_ctx = llama_n_ctx(ctx.get());
- const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
+ const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0) + 1;
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf(LOG_COL_DEFAULT "\n");
printe("context size exceeded\n");
int n_tasks_deferred;
int64_t t_start;
- int32_t kv_cache_tokens_count;
- int32_t kv_cache_used_cells;
-
// TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
uint64_t n_prompt_tokens_processed_total = 0;
uint64_t t_prompt_processing_total = 0;
{ "n_decode_total", n_decode_total },
{ "n_busy_slots_total", n_busy_slots_total },
- { "kv_cache_tokens_count", kv_cache_tokens_count },
- { "kv_cache_used_cells", kv_cache_used_cells },
-
{ "slots", slots_data },
};
}
res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
res->t_start = metrics.t_start;
- res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
- res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
-
res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
res->t_prompt_processing_total = metrics.t_prompt_processing_total;
res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
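If a deployment still needs the removed gauges, they can be approximated on the application side from llama_kv_self_seq_pos_min()/llama_kv_self_seq_pos_max() per sequence (as in the migration sketch after the llama.h hunk above), e.g. dividing the summed position spans by n_ctx to recover a usage ratio.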
{"name", "predicted_tokens_seconds"},
{"help", "Average generation throughput in tokens/s."},
{"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
- },{
- {"name", "kv_cache_usage_ratio"},
- {"help", "KV-cache usage. 1 means 100 percent usage."},
- {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx}
- },{
- {"name", "kv_cache_tokens"},
- {"help", "KV-cache tokens."},
- {"value", (uint64_t) res_metrics->kv_cache_tokens_count}
},{
{"name", "requests_processing"},
{"help", "Number of requests processing."},