llama : deprecate explicit kv_self defrag/update calls (#13921)

author Georgi Gerganov <redacted>

Sat, 31 May 2025 12:58:33 +0000 (15:58 +0300)

committer GitHub <redacted>

Sat, 31 May 2025 12:58:33 +0000 (15:58 +0300)
author Georgi Gerganov <redacted>
Sat, 31 May 2025 12:58:33 +0000 (15:58 +0300)
committer GitHub <redacted>
Sat, 31 May 2025 12:58:33 +0000 (15:58 +0300)
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp

index 347ea4a698f2e2d362faf5084a8198bc2c2e85eb..5ac881b45e26895c622afd5324308f36ca548780 100644 (file)
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -133,9 +133,8 @@ int main(int argc, char ** argv) {
              const int ib = i/n_batch - 1;
              const int bd = n_batch_grp*(n_grp - 1);
  
-            llama_kv_self_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_self_update  (ctx);
+            llama_kv_self_seq_add(ctx, 0, n_past - n_batch,         n_past,         ib*bd);
+            llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
  
              n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
          }
@@ -169,8 +168,6 @@ int main(int argc, char ** argv) {
  
          llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
          llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-      //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);
  
          n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  
@@ -200,8 +197,6 @@ int main(int argc, char ** argv) {
  
              llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
              llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-          //llama_kv_self_defrag (ctx);
-            llama_kv_self_update (ctx);
  
              n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
          }
diff --git a/include/llama.h b/include/llama.h

index 6e13358bbbd963554fd32d8a542d737c83f96b04..da0f652cfd63a409014cae7bff57956611f9e817 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -655,7 +655,6 @@ extern "C" {
      // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
      // If the KV cache is RoPEd, the KV data is updated accordingly:
      //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
      // p0 < 0 : [0,  p1]
      // p1 < 0 : [p0, inf)
      LLAMA_API void llama_kv_self_seq_add(
@@ -668,7 +667,6 @@ extern "C" {
      // Integer division of the positions by factor of `d > 1`
      // If the KV cache is RoPEd, the KV data is updated accordingly:
      //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
      // p0 < 0 : [0,  p1]
      // p1 < 0 : [p0, inf)
      LLAMA_API void llama_kv_self_seq_div(
@@ -696,16 +694,15 @@ extern "C" {
      // Defragment the KV cache
      // This will be applied:
      //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    // TODO: deprecate and always update the cache lazily [TAG: API_KV_NO_DEFRAG]
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
  
      // Check if the context supports KV cache shifting
      LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
  
      // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    // TODO: deprecate and always update the cache lazily [TAG: API_KV_NO_DEFRAG]
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");
  
      //
      // State / sessions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index d913497675d616695beb91ffee9e9e34142012ca..4ab57438794005fb3416b792156994a48b70d448 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2281,6 +2281,7 @@ llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
      return ctx->get_kv_self();
  }
  
+// deprecated
  void llama_kv_self_update(llama_context * ctx) {
      ctx->kv_self_update();
  }
@@ -2535,6 +2536,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
      return kv->seq_pos_max(seq_id);
  }
  
+// deprecated
  void llama_kv_self_defrag(llama_context * ctx) {
      auto * kv = ctx->get_kv_self();
      if (!kv) {
author	Georgi Gerganov <redacted>
	Sat, 31 May 2025 12:58:33 +0000 (15:58 +0300)
committer	GitHub <redacted>
	Sat, 31 May 2025 12:58:33 +0000 (15:58 +0300)
examples/passkey/passkey.cpp		patch | blob | history
include/llama.h		patch | blob | history
src/llama-context.cpp		patch | blob | history