llama : remove deprecated llama_kv_self API (#15472)

author Georgi Gerganov <redacted>

Thu, 21 Aug 2025 16:13:45 +0000 (19:13 +0300)

committer GitHub <redacted>

Thu, 21 Aug 2025 16:13:45 +0000 (19:13 +0300)
author Georgi Gerganov <redacted>
Thu, 21 Aug 2025 16:13:45 +0000 (19:13 +0300)
committer GitHub <redacted>
Thu, 21 Aug 2025 16:13:45 +0000 (19:13 +0300)
diff --git a/include/llama.h b/include/llama.h

index c465ced4ffa01b2b84691db535ec913d05d5e087..662e0971dff2fa71c44bcb75e9a4ef5d49a1fb44 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -663,111 +663,6 @@ extern "C" {
      // Check if the memory supports shifting
      LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
  
-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-               "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-               "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-                struct llama_context * ctx),
-            "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    // seq_id < 0 : match any sequence
-    // p0 < 0     : [0,  p1]
-    // p1 < 0     : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta),
-            "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d),
-            "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-            "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-            "simply remove this call, updates are applied lazily on the next llama_decode()");
-
      //
      // State / sessions
      //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index fb6fbe982c663a56869d7af096c14bcdf3e57a70..e8e8b3450a5d2b5bd59c0ab232e260031758c367 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -93,7 +93,7 @@ llama_context::llama_context(
      // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
      // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
      // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
+    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
      if (cparams.n_batch < GGML_KQ_MASK_PAD) {
          LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
          cparams.n_batch = GGML_KQ_MASK_PAD;
@@ -439,26 +439,12 @@ llama_memory_t llama_context::get_memory() const {
      return memory.get();
  }
  
-// deprecated
-void llama_context::kv_self_defrag_sched() {
-    if (!memory) {
-        return;
-    }
-
-    memory_force_optimize = true;
-}
-
-// deprecated
-bool llama_context::kv_self_update(bool optimize) {
+bool llama_context::memory_update(bool optimize) {
      if (!memory) {
          return false;
      }
  
      {
-        // TODO: remove in the future
-        optimize |= memory_force_optimize;
-        memory_force_optimize = false;
-
          const auto mctx = memory->init_update(this, optimize);
          switch (mctx->get_status()) {
              case LLAMA_MEMORY_STATUS_SUCCESS:
@@ -993,7 +979,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
      bool did_optimize = false;
  
      // handle any pending defrags/shifts
-    kv_self_update(false);
+    memory_update(false);
  
      llama_memory_context_ptr mctx;
  
@@ -1018,7 +1004,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                      if (!did_optimize) {
                          did_optimize = true;
  
-                        if (kv_self_update(true)) {
+                        if (memory_update(true)) {
                              LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());
  
                              continue;
@@ -2338,11 +2324,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
      return &ctx->get_model();
  }
  
-// deprecated
-void llama_kv_self_update(llama_context * ctx) {
-    ctx->kv_self_update(false);
-}
-
  enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
      return ctx->pooling_type();
  }
@@ -2560,168 +2541,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
      return mem->get_can_shift();
  }
  
-//
-// kv cache
-//
-
-// deprecated
-int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-// note: this is the same as above - will be removed anyway, so it's ok
-int32_t llama_kv_self_used_cells(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-void llama_kv_self_clear(llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_clear(kv, true);
-}
-
-// deprecated
-bool llama_kv_self_seq_rm(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return true;
-    }
-
-    return llama_memory_seq_rm(kv, seq_id, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_cp(
-        llama_context * ctx,
-         llama_seq_id   seq_id_src,
-         llama_seq_id   seq_id_dst,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_keep(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_seq_add(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-            llama_pos   delta) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_add(kv, seq_id, p0, p1, delta);
-}
-
-// deprecated
-void llama_kv_self_seq_div(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-                  int   d) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_div(kv, seq_id, p0, p1, d);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_min(kv, seq_id);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_max(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_defrag(llama_context * ctx) {
-    // force defrag
-    ctx->kv_self_defrag_sched();
-}
-
-// deprecated
-bool llama_kv_self_can_shift(const llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return false;
-    }
-
-    return llama_memory_can_shift(kv);
-}
-
  // llama state API
  
  // deprecated
diff --git a/src/llama-context.h b/src/llama-context.h

index 230ef8962b8fa41113f0c38d94798a1774a05f35..3dd9205446483eb424b7b191ceaf41fc77f012d1 100644 (file)
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -46,10 +46,8 @@ struct llama_context {
  
      llama_memory_t get_memory() const;
  
-    // return true of the KV cache was updated
-    // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    // return true if the memory was updated
+    bool memory_update(bool optimize);
  
      enum llama_pooling_type pooling_type() const;
  
@@ -230,9 +228,6 @@ private:
  
      std::unique_ptr<llama_memory_i> memory;
  
-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
      // decode output (2-dimensional array: [n_outputs][n_vocab])
      size_t  logits_size = 0; // capacity (of floats) for logits
      float * logits      = nullptr;
author	Georgi Gerganov <redacted>
	Thu, 21 Aug 2025 16:13:45 +0000 (19:13 +0300)
committer	GitHub <redacted>
	Thu, 21 Aug 2025 16:13:45 +0000 (19:13 +0300)
include/llama.h		patch \| blob \| history
src/llama-context.cpp		patch \| blob \| history
src/llama-context.h		patch \| blob \| history