void llama_kv_cache_unified::set_full() { // simulate a full KV cache
n = size; // n is set to the total number of cells
+
+ // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+ // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+ // the only requirement is that the head position cannot cause an out-of-bounds view of the K and V tensors,
+ // and setting it to 0 is the simplest way to guarantee that
+ // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+ head = 0;
}
llama_sbatch llama_kv_cache_unified::sbatch_init(
void llama_kv_cache_recurrent::set_full() {
n = size; // n is set to the total number of cells
+ head = 0; // NOTE(review): presumably the same rationale as llama_kv_cache_unified::set_full() (head = 0 avoids out-of-bounds K/V views) - confirm
}
llama_sbatch llama_kv_cache_recurrent::sbatch_init(
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
- // Note: The value of head isn't only used to optimize searching
- // for a free KV slot. llama_decode_impl also uses it, so it
- // cannot be freely changed after a slot has been allocated.
- uint32_t head = 0;
- uint32_t size = 0;
+ uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+ uint32_t size = 0; // total number of cells, shared across all sequences
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// computed before each graph build
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
- // Note: The value of head isn't only used to optimize searching
- // for a free KV slot. llama_decode_impl also uses it, so it
- // cannot be freely changed after a slot has been allocated.
- uint32_t head = 0;
- uint32_t size = 0;
+ uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+ uint32_t size = 0; // total number of cells, shared across all sequences
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// computed before each graph build