llama : remove llama_kv_cache_view API + remove deprecated (#13653)

author Georgi Gerganov <redacted>

Tue, 20 May 2025 13:13:16 +0000 (16:13 +0300)

committer GitHub <redacted>

Tue, 20 May 2025 13:13:16 +0000 (16:13 +0300)
author Georgi Gerganov <redacted>
Tue, 20 May 2025 13:13:16 +0000 (16:13 +0300)
committer GitHub <redacted>
Tue, 20 May 2025 13:13:16 +0000 (16:13 +0300)
diff --git a/common/arg.cpp b/common/arg.cpp

index e2676bb878e282117adc7f42e5c93bd3e15ace6f..b1754f30fca91a3eebd40dda34d8296f080c6651 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1452,7 +1452,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          [](common_params & params) {
              params.swa_full = true;
          }
-    ));
+    ).set_env("LLAMA_ARG_SWA_FULL"));
      add_opt(common_arg(
          {"--no-context-shift"},
          string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2065,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              params.grp_attn_w = value;
          }
      ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
      add_opt(common_arg(
          {"-nkvo", "--no-kv-offload"},
          "disable KV offload",
diff --git a/common/common.cpp b/common/common.cpp

index e76dfad58547e7503e72d447561c5f7fcf643f7b..eb16055ea6448bbad88dba74fe9618bde7263f9b 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1329,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
      return text;
  }
  
-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
  //
  // Embedding utils
  //
diff --git a/common/common.h b/common/common.h

index ee15c67cb2671963375aae5300fc15abfb548206..556ff5be4079843bf44f35425847e6cca85d8ef2 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -330,7 +330,6 @@ struct common_params {
      bool use_mlock         = false; // use mlock to keep model in memory
      bool verbose_prompt    = false; // print prompt tokens before generation
      bool display_prompt    = true;  // print prompt before generation
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
      bool no_kv_offload     = false; // disable KV offloading
      bool warmup            = true;  // warmup run
      bool check_tensors     = false; // validate tensor data
@@ -622,16 +621,6 @@ std::string common_detokenize(
          const std::vector<llama_token> & tokens,
                                    bool   special = true);
  
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
  //
  // Embedding utils
  //
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp

index 7df20aee170466d4d474846b1a0ed20580a42564..5f8620973f40e3037ef2c3e0b5f8a2ac9ae58915 100644 (file)
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
      const int N = 5;  // n-gram size
      const int G = 15; // max verification n-grams
  
-    const bool dump_kv_cache = params.dump_kv_cache;
-
      // init llama.cpp
      llama_backend_init();
      llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
      // here we keep adding new n-grams as we go
      ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
  
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
      const auto t_dec_start = ggml_time_us();
  
      // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
      }
  
      while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
          // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
          //
          // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {
  
      common_sampler_free(smpl);
  
-    llama_kv_cache_view_free(&kvc_view);
-
      llama_batch_free(batch);
  
      llama_backend_free();
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp

index 4ae93b2a5ed15c08625f66626b38b71113fce383..2ee502939d55462a92fdf424e7fdb0aa13b6bec0 100644 (file)
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
      // max. number of additional tokens to draft if match is found
      const int n_draft = params.speculative.n_max;
  
-    const bool dump_kv_cache = params.dump_kv_cache;
-
      // init llama.cpp
      llama_backend_init();
      llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){
  
      llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
  
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
      const auto t_dec_start = ggml_time_us();
  
      while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
          // print current draft sequence
          LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
  
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp

index b967731a2153cec2d25eef70b03e660c34e28ee8..acb1301a2b6194e1f7114569a30f6cfd1efd1177 100644 (file)
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
      // insert new requests as soon as the previous one is done
      const bool cont_batching = params.cont_batching;
  
-    const bool dump_kv_cache = params.dump_kv_cache;
-
      // is the system prompt shared in the cache
      const bool is_sp_shared = params.is_pp_shared;
  
@@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
      int32_t n_total_gen    = 0;
      int32_t n_cache_miss   = 0;
  
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
      const auto t_main_start = ggml_time_us();
  
      LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
      LOG_INF("Processing requests ...\n\n");
  
      while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
          common_batch_clear(batch);
  
          // decode any currently ongoing sequences
diff --git a/include/llama.h b/include/llama.h

index 1064f89466256415e22ce1cc909211e6e6df5e0f..6b4fc5d1179af4c8fbcf10402b5c6dde37a66a4d 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -608,72 +608,13 @@ extern "C" {
      // KV cache
      //
  
-    // TODO: start using struct llama_kv_cache
-
-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_seq_max;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_seq_max items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    ///
-
      // Returns the number of tokens in the KV cache (slow, use only for debug)
      // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
      LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
  
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "use llama_kv_self_n_tokens instead");
-
      // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
      LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
  
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-            "use llama_kv_self_used_cells instead");
-
      // Clear the KV cache - both cell info is erased and KV data is zeroed
      LLAMA_API void llama_kv_self_clear(
              struct llama_context * ctx);
@@ -756,61 +697,6 @@ extern "C" {
      // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
      LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
  
-    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx),
-            "use llama_kv_self_clear instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "use llama_kv_self_seq_rm instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "use llama_kv_self_seq_cp instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "use llama_kv_self_seq_keep instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta),
-            "use llama_kv_self_seq_add instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d),
-            "use llama_kv_self_seq_div instead");
-
-    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "use llama_kv_self_seq_pos_max instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
-            "use llama_kv_self_defrag instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
-            "use llama_kv_self_can_shift instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
-            "use llama_kv_self_update instead");
-
-
      //
      // State / sessions
      //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index af0bfbddbd736a9de7e6752a6d4a0d9f77c957af..bba3ee0b50592dfe98f0709d80ad3cd3f532e794 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2288,39 +2288,10 @@ int32_t llama_apply_adapter_cvec(
      return res ? 0 : -1;
  }
  
-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
-    const auto * kv = ctx->get_kv_self();
-    if (kv == nullptr) {
-        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
-        return {};
-    }
-
-    return llama_kv_cache_view_init(*kv, n_seq_max);
-}
-
-void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
-    const auto * kv = ctx->get_kv_self();
-    if (kv == nullptr) {
-        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
-        return;
-    }
-
-    llama_kv_cache_view_update(view, kv);
-}
-
  //
  // kv cache
  //
  
-// deprecated
-int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
-    return llama_kv_self_n_tokens(ctx);
-}
-
  int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
      const auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2330,11 +2301,6 @@ int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
      return kv->get_n_tokens();
  }
  
-// deprecated
-int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
-    return llama_kv_self_used_cells(ctx);
-}
-
  int32_t llama_kv_self_used_cells(const llama_context * ctx) {
      const auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2344,11 +2310,6 @@ int32_t llama_kv_self_used_cells(const llama_context * ctx) {
      return kv->get_used_cells();
  }
  
-// deprecated
-void llama_kv_cache_clear(llama_context * ctx) {
-    llama_kv_self_clear(ctx);
-}
-
  void llama_kv_self_clear(llama_context * ctx) {
      auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2358,15 +2319,6 @@ void llama_kv_self_clear(llama_context * ctx) {
      kv->clear();
  }
  
-// deprecated
-bool llama_kv_cache_seq_rm(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1) {
-    return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
-}
-
  bool llama_kv_self_seq_rm(
          llama_context * ctx,
           llama_seq_id   seq_id,
@@ -2380,16 +2332,6 @@ bool llama_kv_self_seq_rm(
      return kv->seq_rm(seq_id, p0, p1);
  }
  
-// deprecated
-void llama_kv_cache_seq_cp(
-        llama_context * ctx,
-         llama_seq_id   seq_id_src,
-         llama_seq_id   seq_id_dst,
-            llama_pos   p0,
-            llama_pos   p1) {
-    llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
-}
-
  void llama_kv_self_seq_cp(
          llama_context * ctx,
           llama_seq_id   seq_id_src,
@@ -2404,13 +2346,6 @@ void llama_kv_self_seq_cp(
      kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
  }
  
-// deprecated
-void llama_kv_cache_seq_keep(
-        llama_context * ctx,
-         llama_seq_id   seq_id) {
-    llama_kv_self_seq_keep(ctx, seq_id);
-}
-
  void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
      auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2420,16 +2355,6 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
      kv->seq_keep(seq_id);
  }
  
-// deprecated
-void llama_kv_cache_seq_add(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-            llama_pos   delta) {
-    llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
-}
-
  void llama_kv_self_seq_add(
          llama_context * ctx,
           llama_seq_id   seq_id,
@@ -2444,16 +2369,6 @@ void llama_kv_self_seq_add(
      kv->seq_add(seq_id, p0, p1, delta);
  }
  
-// deprecated
-void llama_kv_cache_seq_div(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-                  int   d) {
-    llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
-}
-
  void llama_kv_self_seq_div(
          llama_context * ctx,
           llama_seq_id   seq_id,
@@ -2477,11 +2392,6 @@ llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
      return kv->seq_pos_min(seq_id);
  }
  
-// deprecated
-llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    return llama_kv_self_seq_pos_max(ctx, seq_id);
-}
-
  llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
      const auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2491,11 +2401,6 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
      return kv->seq_pos_max(seq_id);
  }
  
-// deprecated
-void llama_kv_cache_defrag(llama_context * ctx) {
-    llama_kv_self_defrag(ctx);
-}
-
  void llama_kv_self_defrag(llama_context * ctx) {
      auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2506,11 +2411,6 @@ void llama_kv_self_defrag(llama_context * ctx) {
      kv->defrag_sched(-1.0f);
  }
  
-// deprecated
-bool llama_kv_cache_can_shift(const llama_context * ctx) {
-    return llama_kv_self_can_shift(ctx);
-}
-
  bool llama_kv_self_can_shift(const llama_context * ctx) {
      const auto * kv = ctx->get_kv_self();
      if (!kv) {
@@ -2520,11 +2420,6 @@ bool llama_kv_self_can_shift(const llama_context * ctx) {
      return kv->get_can_shift();
  }
  
-// deprecated
-void llama_kv_cache_update(llama_context * ctx) {
-    llama_kv_self_update(ctx);
-}
-
  // llama state API
  
  // deprecated
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp

index ea832549f3af8df49316da9fa265f0216e106c95..77b2c0dbf895619f8974412ec27b74d3da5e4e12 100644 (file)
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -2888,38 +2888,3 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce
  
      return true;
  }
-
-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
-    llama_kv_cache_view result = {
-        /*.n_cells            = */ 0,
-        /*.n_seq_max          = */ n_seq_max,
-        /*.token_count        = */ 0,
-        /*.used_cells         = */ kv.get_used_cells(),
-        /*.max_contiguous     = */ 0,
-        /*.max_contiguous_idx = */ -1,
-        /*.cells              = */ nullptr,
-        /*.cells_sequences    = */ nullptr,
-    };
-
-    return result;
-}
-
-void llama_kv_cache_view_free(llama_kv_cache_view * view) {
-    if (view->cells != nullptr) {
-        free(view->cells);
-        view->cells = nullptr;
-    }
-    if (view->cells_sequences != nullptr) {
-        free(view->cells_sequences);
-        view->cells_sequences = nullptr;
-    }
-}
-
-void llama_kv_cache_view_update(llama_kv_cache_view * , const llama_kv_cache * ) {
-    // TODO: will be removed soon, keep this for now to avoid too many changes in
-    //       https://github.com/ggml-org/llama.cpp/pull/13194
-}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h

index 256a7d43ed57f04b4e4fa66e504fac69b3f97bc1..bd0485bc6a4ba8111780de96d73ffaaeb018a4ba 100644 (file)
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -534,12 +534,3 @@ private:
      bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
      bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
  };
-
-
-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
-
-void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
author	Georgi Gerganov <redacted>
	Tue, 20 May 2025 13:13:16 +0000 (16:13 +0300)
committer	GitHub <redacted>
	Tue, 20 May 2025 13:13:16 +0000 (16:13 +0300)
common/arg.cpp		patch | blob | history
common/common.cpp		patch | blob | history
common/common.h		patch | blob | history
examples/lookahead/lookahead.cpp		patch | blob | history
examples/lookup/lookup.cpp		patch | blob | history
examples/parallel/parallel.cpp		patch | blob | history
include/llama.h		patch | blob | history
src/llama-context.cpp		patch | blob | history
src/llama-kv-cache.cpp		patch | blob | history
src/llama-kv-cache.h		patch | blob | history