From: Georgi Gerganov
Date: Wed, 14 May 2025 20:15:15 +0000 (+0300)
Subject: kv-cache : fix out-of-bounds view during reserve graph (#13547)
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e3a9421b78da5c810d812e6348a405f5acc39f34;p=pkg%2Fggml%2Fsources%2Fllama.cpp

kv-cache : fix out-of-bounds view during reserve graph (#13547)

* kv-cache : fix reserve graph out-of-bounds access

ggml-ci

* cont : add comment

* cont : fix comments [no ci]

* cont : more correct comment [no ci]
---

diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 3dcad65b..265db252 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
+    // setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index bf3b4b6a..e83e12c0 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -171,11 +171,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
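
Note: the reasoning in the added comment can be made concrete with a small, self-contained C++ sketch. This is not the actual llama.cpp code: kv_cache_sketch and its members are hypothetical simplifications, and the real K/V views are built as ggml tensor views rather than checked with a plain bounds test. The point it illustrates is that the views cover cells [head, head + n), so once set_full() makes n == size, any nonzero head pushes the view window past the end of the cache, while head = 0 keeps it exactly in bounds.

    // kv_bounds_sketch.cpp - hypothetical, simplified illustration (not the actual llama.cpp code)
    #include <cstdint>
    #include <cstdio>

    struct kv_cache_sketch {
        uint32_t head = 0; // where the next batch would be placed (see find_slot() in the real code)
        uint32_t size = 0; // total number of cells
        uint32_t n    = 0; // number of cells covered by the current K/V views

        // emulates set_full() after this commit: the view window [head, head + n)
        // must stay inside the cache, so head is reset together with n
        void set_full() {
            n    = size;
            head = 0;
        }

        bool view_in_bounds() const {
            return head + n <= size;
        }
    };

    int main() {
        kv_cache_sketch kv;
        kv.size = 4096;
        kv.head = 128;   // a previous batch left head at a nonzero position

        kv.n = kv.size;  // old behavior: only n was set -> window [128, 128 + 4096) exceeds the cache
        std::printf("without head reset: in bounds = %d\n", kv.head + kv.n <= kv.size);

        kv.set_full();   // new behavior: head is reset -> window [0, 4096) fits exactly
        std::printf("with head reset:    in bounds = %d\n", kv.view_in_bounds());
        return 0;
    }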