kv-cache : add SWA support (#13194)

author Georgi Gerganov <redacted>

Tue, 20 May 2025 05:05:46 +0000 (08:05 +0300)

committer GitHub <redacted>

Tue, 20 May 2025 05:05:46 +0000 (08:05 +0300)
author Georgi Gerganov <redacted>
Tue, 20 May 2025 05:05:46 +0000 (08:05 +0300)
committer GitHub <redacted>
Tue, 20 May 2025 05:05:46 +0000 (08:05 +0300)
diff --git a/common/arg.cpp b/common/arg.cpp

index 305168043c27c2ce68776c292fe8a065dce34c36..e2676bb878e282117adc7f42e5c93bd3e15ace6f 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              params.n_keep = value;
          }
      ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ));
      add_opt(common_arg(
          {"--no-context-shift"},
          string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
diff --git a/common/common.cpp b/common/common.cpp

index 4c835771fc449cb0eaf965adbc946d024d7be1b7..e76dfad58547e7503e72d447561c5f7fcf643f7b 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1136,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
      cparams.flash_attn        = params.flash_attn;
      cparams.no_perf           = params.no_perf;
      cparams.op_offload        = !params.no_op_offload;
+    cparams.swa_full          = params.swa_full;
  
      if (params.reranking) {
          cparams.embeddings    = true;
diff --git a/common/common.h b/common/common.h

index f3cb00be362dabb38a41d0a782971c0cc9293446..ee15c67cb2671963375aae5300fc15abfb548206 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -323,6 +323,7 @@ struct common_params {
      bool flash_attn        = false; // flash attention
      bool no_perf           = false; // disable performance metrics
      bool ctx_shift         = true;  // context shift on inifinite text generation
+    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  
      bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
      bool use_mmap          = true;  // use mmap for faster loads
diff --git a/include/llama.h b/include/llama.h

index 99e5fba244fcc2902fa9e97a085392820ee3975e..1064f89466256415e22ce1cc909211e6e6df5e0f 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -361,10 +361,11 @@ extern "C" {
  
          // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
          bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-        bool op_offload;  // whether to offload host tensor operations to device
+        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // use flash attention [EXPERIMENTAL]
+        bool no_perf;     // disable performance timings
+        bool op_offload;  // offload host tensor operations to device
+        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
      };
  
      // model quantization parameters
@@ -730,10 +731,18 @@ extern "C" {
                         llama_pos   p1,
                               int   d);
  
+    // Returns the smallest position present in the KV cache for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Returns -1 if the sequence is empty
+    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
      // Returns the largest position present in the KV cache for the specified sequence
+    // Returns -1 if the sequence is empty
      LLAMA_API llama_pos llama_kv_self_seq_pos_max(
              struct llama_context * ctx,
-                     llama_seq_id   seq_id);
+                    llama_seq_id   seq_id);
  
      // Defragment the KV cache
      // This will be applied:
@@ -943,9 +952,12 @@ extern "C" {
      // Requires KV cache.
      // For encode-decoder contexts, processes the batch using the decoder.
      // Positive return values does not mean a fatal error, but rather a warning.
-    //   0 - success
-    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error. the KV cache state is restored to the state before this call
+    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    //    0 - success
+    //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    //    2 - aborted
+    //   -1 - invalid input batch
+    // < -1 - error
      LLAMA_API int32_t llama_decode(
              struct llama_context * ctx,
                struct llama_batch   batch);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index a3b84a6a82e74dd1014f5abb88f6e95f0e3d9fe5..af0bfbddbd736a9de7e6752a6d4a0d9f77c957af 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -93,6 +93,7 @@ llama_context::llama_context(
      }
  
      cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
      cparams.op_offload = params.op_offload;
  
      const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -176,8 +177,9 @@ llama_context::llama_context(
      // init the memory module
      if (!hparams.vocab_only) {
          llama_memory_params params_mem = {
-            /*.type_k =*/ params.type_k,
-            /*.type_v =*/ params.type_v,
+            /*.type_k   =*/ params.type_k,
+            /*.type_v   =*/ params.type_v,
+            /*.swa_full =*/ params.swa_full,
          };
  
          memory.reset(model.create_memory(params_mem, cparams));
@@ -947,8 +949,6 @@ int llama_context::decode(llama_batch & inp_batch) {
  
          // find KV slot
          if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
              return 1;
          }
  
@@ -2093,6 +2093,7 @@ llama_context_params llama_context_default_params() {
          /*.flash_attn                  =*/ false,
          /*.no_perf                     =*/ true,
          /*.op_offload                  =*/ true,
+        /*.swa_full                    =*/ true,
      };
  
      return result;
@@ -2467,6 +2468,15 @@ void llama_kv_self_seq_div(
      kv->seq_div(seq_id, p0, p1, d);
  }
  
+llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return -1;
+    }
+
+    return kv->seq_pos_min(seq_id);
+}
+
  // deprecated
  llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
      return llama_kv_self_seq_pos_max(ctx, seq_id);
@@ -2475,7 +2485,7 @@ llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
  llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
      const auto * kv = ctx->get_kv_self();
      if (!kv) {
-        return 0;
+        return -1;
      }
  
      return kv->seq_pos_max(seq_id);
@@ -2637,7 +2647,21 @@ int32_t llama_encode(
  int32_t llama_decode(
          llama_context * ctx,
            llama_batch   batch) {
-    const int ret = ctx->decode(batch);
+    int ret = ctx->decode(batch);
+
+    // defrag and try again
+    // TODO: distinguish return code when we are sure that even after defrag there is no space available
+    if (ret == 1) {
+        llama_kv_self_defrag(ctx);
+        ret = ctx->decode(batch);
+
+        if (ret == 1) {
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+            return ret;
+        }
+    }
+
      if (ret != 0) {
          LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
      }
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp

index b0e3f63597a76d0481b77e949f823cda749fd561..410d2608798b84f6c5756f1aaf7aa5ae397041e4 100644 (file)
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -9,33 +9,6 @@
  #include <cmath>
  #include <cstring>
  
-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
-    // TODO move to hparams if a T5 variant appears that uses a different value
-    const int64_t max_distance = 128;
-
-    if (bidirectional) {
-        n_buckets >>= 1;
-    }
-
-    const int64_t max_exact = n_buckets >> 1;
-
-    int32_t relative_position = x - y;
-    int32_t relative_bucket = 0;
-
-    if (bidirectional) {
-        relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
-    } else {
-        relative_position = -std::min<int32_t>(relative_position, 0);
-    }
-
-    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
-    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
-    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
-    return relative_bucket;
-}
-
  void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
      if (ubatch->token) {
          const int64_t n_tokens = ubatch->n_tokens;
@@ -110,22 +83,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
  
  void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
      if (pos_bucket) {
-        const int64_t n_tokens = ubatch->n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
-        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
-        int32_t * data = (int32_t *) pos_bucket->data;
-
-        const int64_t n_kv = kv_self->n;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_kv; ++i) {
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
-                }
-            }
-        }
+        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
      }
  }
  
@@ -403,99 +361,18 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
  }
  
  void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
-    if (self_kq_mask || self_kq_mask_swa) {
-        const int64_t n_kv         = kv_self->n;
-        const int64_t n_tokens     = ubatch->n_tokens;
-        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs       = ubatch->n_seqs;
-
-        float * data     = nullptr;
-        float * data_swa = nullptr;
-
-        if (self_kq_mask) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-            data = (float *) self_kq_mask->data;
-        }
-
-        if (self_kq_mask_swa) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-            data_swa = (float *) self_kq_mask_swa->data;
-        }
-
-        // Use only the previous KV cells of the correct sequence for each token of the ubatch.
-        // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
-        // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
-        //   Causal mask:
-        //      xxx-------
-        //      xxxx------
-        //      xxxxx-----
-        //   Non-causal mask:
-        //      xxxxx-----
-        //      xxxxx-----
-        //      xxxxx-----
-        // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-        for (int h = 0; h < 1; ++h) {
-            for (int s = 0; s < n_seqs; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-                for (int j = 0; j < n_seq_tokens; ++j) {
-                    const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        // mask the token if:
-                        if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
-                            || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
-                        ) {
-                            f = -INFINITY;
-                        } else {
-                            if (hparams.use_alibi) {
-                                f = -std::abs(kv_self->cells[i].pos - pos);
-                            } else {
-                                f = 0.0f;
-                            }
-                        }
-
-                        if (data) {
-                            data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-
-                        // may need to cut off old tokens for sliding window
-                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                        if (data_swa) {
-                            if (hparams.n_attn_chunk) {
-                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                                    f = -INFINITY;
-                                }
-                            } else {
-                                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                    f = -INFINITY;
-                                }
-                            }
-                            data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-                    }
-                }
-            }
+    if (self_kq_mask) {
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
+}
  
-            // mask padded tokens
-            if (data) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
+void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+    if (self_kq_mask) {
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
  
-            // mask padded tokens
-            if (data_swa) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-        }
+    if (self_kq_mask_swa) {
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
      }
  }
  
@@ -545,7 +422,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
      n_layer          (hparams.n_layer),
      n_rot            (hparams.n_rot),
      n_ctx            (cparams.n_ctx),
-    n_ctx_per_seq    (cparams.n_ctx / cparams.n_seq_max),
      n_head           (hparams.n_head()),
      n_head_kv        (hparams.n_head_kv()),
      n_embd_head_k    (hparams.n_embd_head_k),
@@ -1153,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
  
      auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
  
-    const auto n_kv = kv_self->n;
+    const auto n_kv = kv_self->get_n();
  
      auto & cur = inp->pos_bucket;
  
@@ -1188,16 +1064,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
           ggml_tensor * kq_b,
           ggml_tensor * kq_mask,
           ggml_tensor * v_mla,
-             bool      v_trans,
               float     kq_scale) const {
-  //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-  //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-  //const int64_t n_head    = hparams.n_head(il);
-  //const int64_t n_head_kv = hparams.n_head_kv(il);
+    const bool v_trans = v->nb[1] > v->nb[2];
  
-  //const auto & n_embd_head_k = hparams.n_embd_head_k;
-  //const auto & n_embd_head_v = hparams.n_embd_head_v;
+    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+    v = ggml_permute(ctx0, v, 0, 2, 1, 3);
  
      const auto n_tokens = q->ne[1];
      const auto n_head   = q->ne[2];
@@ -1336,17 +1208,11 @@ ggml_tensor * llm_graph_context::build_attn(
  
      const auto & kq_mask = inp->get_kq_mask();
  
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;
  
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
      cb(cur, "kqv_out", il);
  
      if (wo) {
@@ -1369,22 +1235,17 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
  
      auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
  
-    const auto n_kv = kv_self->n;
-
-    inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-    //cb(inp->self_kq_mask, "KQ_mask", -1);
-    ggml_set_input(inp->self_kq_mask);
-
-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    {
+        GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA");
+        GGML_ASSERT(hparams.n_swa == 0         && "Use llama_kv_cache_unified_iswa for SWA");
  
-    if (hparams.n_swa_pattern > 1) {
-        GGML_ASSERT(hparams.n_swa > 0);
+        const auto n_kv = kv_self->get_n();
  
-        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
-        ggml_set_input(inp->self_kq_mask_swa);
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);
  
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
      }
  
      return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
@@ -1409,81 +1270,100 @@ ggml_tensor * llm_graph_context::build_attn(
      ggml_build_forward_expand(gf, v_cur);
  
      const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
-    const auto & n_ctx = cparams.n_ctx;
  
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+    }
  
-    const auto n_tokens = q_cur->ne[2];
+    const auto & kq_mask = inp->get_kq_mask();
  
-    const bool v_trans = !cparams.flash_attn;
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv_self->get_k(ctx0, il);
+    ggml_tensor * v = kv_self->get_v(ctx0, il);
  
-    // store to KV cache
-    {
-        const auto kv_head = kv_self->head;
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    cb(cur, "kqv_out", il);
  
-        GGML_ASSERT(kv_self->size == n_ctx);
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+    }
  
-        ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
-        //cb(k_cache_view, "k_cache_view", il);
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
  
-        // note: storing RoPE-ed version of K in the KV cache
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
+    return cur;
+}
  
-        v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
+llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
  
-        ggml_tensor * v_cache_view = nullptr;
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
  
-        if (!v_trans) {
-            v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
-        } else {
-            // note: the V cache is transposed when not using flash attention
-            v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
-                    (  n_ctx)*ggml_element_size(kv_self->v_l[il]),
-                    (kv_head)*ggml_element_size(kv_self->v_l[il]));
+    {
+        const auto n_kv = kv_self->get_kv_base()->get_n();
  
-            v_cur = ggml_transpose(ctx0, v_cur);
-        }
-        //cb(v_cache_view, "v_cache_view", il);
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);
+
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    }
  
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
+    if (hparams.n_swa_pattern > 1) {
+        GGML_ASSERT(hparams.n_swa > 0          && "Use llama_kv_cache_unified for non-SWA");
+
+        const auto n_kv = kv_self->get_kv_swa()->get_n();
+
+        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
      }
  
+    return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_cgraph * gf,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+            float     kq_scale,
+            int       il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    ggml_build_forward_expand(gf, v_cur);
+
      const bool is_swa = hparams.is_swa(il);
  
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+    }
+
      const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
  
-    const auto n_kv = kv_self->n;
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv->get_k(ctx0, il);
+    ggml_tensor * v = kv->get_v(ctx0, il);
  
-    const int64_t n_head_kv = hparams.n_head_kv(il);
-
-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-    const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k =
-        ggml_view_3d(ctx0, kv_self->k_l[il],
-                n_embd_head_k, n_kv, n_head_kv,
-                ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
-                0);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = !v_trans ?
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-                n_embd_head_v, n_kv, n_head_kv,
-                ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-                ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
-                0) :
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-                n_kv, n_embd_head_v, n_head_kv,
-                ggml_element_size(kv_self->v_l[il])*n_ctx,
-                ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
-                0);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
      cb(cur, "kqv_out", il);
  
      if (wo) {
@@ -1534,17 +1414,11 @@ ggml_tensor * llm_graph_context::build_attn(
  
      const auto & kq_mask = inp->get_kq_mask_cross();
  
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;
  
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
      cb(cur, "kqv_out", il);
  
      if (wo) {
@@ -1712,3 +1586,30 @@ void llm_graph_context::build_pooling(
  
      ggml_build_forward_expand(gf, cur);
  }
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+    return relative_bucket;
+}
diff --git a/src/llama-graph.h b/src/llama-graph.h

index 832a8c09f2b80eb816326458d100caa2ca262764..2b85bb25befbac4e9c3294c06d09e641c856a7ef 100644 (file)
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -19,6 +19,7 @@ struct llama_cparams;
  
  class llama_memory_i;
  class llama_kv_cache_unified;
+class llama_kv_cache_unified_iswa;
  class llama_kv_cache_recurrent;
  
  // certain models (typically multi-modal) can produce different types of graphs
@@ -255,6 +256,31 @@ public:
  
      void set_input(const llama_ubatch * ubatch) override;
  
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_kv_cache_unified * kv_self;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_unified_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified_iswa * kv_self) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_self(kv_self) {
+    }
+    ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
      ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
      ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
  
@@ -266,7 +292,7 @@ public:
      const llama_hparams & hparams;
      const llama_cparams & cparams;
  
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_iswa * kv_self;
  };
  
  class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -378,7 +404,6 @@ struct llm_graph_context {
      const int64_t n_layer;
      const int64_t n_rot;
      const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
      const int64_t n_head;
      const int64_t n_head_kv;
      const int64_t n_embd_head_k;
@@ -507,13 +532,12 @@ struct llm_graph_context {
  
      ggml_tensor * build_attn_mha(
               ggml_cgraph * gf,
-             ggml_tensor * q,     // [n_embd_head_q, n_tokens, n_head_q]
-             ggml_tensor * k,     // [n_embd_head_k, n_tokens, n_head_k]
-             ggml_tensor * v,     // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+             ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
+             ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
+             ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
               ggml_tensor * kq_b,
               ggml_tensor * kq_mask,
-             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-                    bool   v_trans,
+             ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                     float   kq_scale) const;
  
      llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -546,6 +570,21 @@ struct llm_graph_context {
                    float   kq_scale,
                      int   il) const;
  
+    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il) const;
+
      llm_graph_input_attn_cross * build_attn_inp_cross() const;
  
      ggml_tensor * build_attn(
@@ -596,3 +635,6 @@ struct llm_graph_context {
              ggml_tensor * cls_out,
              ggml_tensor * cls_out_b) const;
  };
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h

index 7ee6a5b75ad1ef66a0e3d21a514e257ae6dcecf1..f865cbaea0240383bed276c03ef9eb684ad8014f 100644 (file)
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -14,6 +14,12 @@ enum llama_expert_gating_func_type {
      LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
  };
  
+enum llama_swa_type { // Sliding Window Attention (SWA) masking mode, interpreted by llama_kv_cache_unified::is_masked_swa()
+    LLAMA_SWA_TYPE_NONE     = 0, // no window rule: only cells with a negative position are masked
+    LLAMA_SWA_TYPE_STANDARD = 1, // a cell at p0 is masked for a query at p1 when p1 - p0 >= n_swa
+    LLAMA_SWA_TYPE_CHUNKED  = 2, // a cell at p0 is masked when p0 lies before the n_swa-aligned chunk that contains p1
+};
+
  struct llama_hparams_posnet {
      uint32_t n_embd;
      uint32_t n_layer;
@@ -35,8 +41,6 @@ struct llama_hparams {
      uint32_t n_embd_features = 0;
      uint32_t n_layer;
      uint32_t n_rot;
-    uint32_t n_swa = 0; // sliding window attention (SWA)
-    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
      uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
      uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
      uint32_t n_expert = 0;
@@ -96,6 +100,12 @@ struct llama_hparams {
  
      std::array<int, 4> rope_sections;
  
+    // Sliding Window Attention (SWA)
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+    uint32_t n_swa = 0;         // the size of the sliding window (0 - no SWA)
+    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
+
      // for State Space Models
      uint32_t ssm_d_conv  = 0;
      uint32_t ssm_d_inner = 0;
@@ -116,11 +126,10 @@ struct llama_hparams {
      bool causal_attn   = true;
      bool use_alibi     = false;
      bool attn_soft_cap = false;
+    bool use_kq_norm   = true;
  
+    // llama4
      uint32_t n_moe_layer_step        = 0;
-    bool     use_kq_norm             = true;
-    uint32_t n_attn_chunk            = 0;
-    // values below seems to be fixed on llama4
      uint32_t n_no_rope_layer_step    = 4;
      uint32_t n_attn_temp_floor_scale = 8192;
      float    f_attn_temp_scale       = 0.1;
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp

index 265db2527c7ca5d012cd5c305bd8ecefdbba983a..ea832549f3af8df49316da9fa265f0216e106c95 100644 (file)
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -23,40 +23,28 @@ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
  }
  
  llama_kv_cache_unified::llama_kv_cache_unified(
-        const llama_model & model,
-                ggml_type   type_k,
-                ggml_type   type_v,
-                     bool   v_trans,
-                     bool   offload,
-                 uint32_t   kv_size,
-                 uint32_t   padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
-    const int32_t n_layer = hparams.n_layer;
-
-    has_shift = false;
-    can_shift = true;
-
-    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n",
-            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding);
-
+        const llama_model &  model,
+          layer_filter_cb && filter,
+                ggml_type    type_k,
+                ggml_type    type_v,
+                     bool    v_trans,
+                     bool    offload,
+                 uint32_t    kv_size,
+                 uint32_t    padding,
+                 uint32_t    n_swa,
+           llama_swa_type    swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
      GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
  
-    head = 0;
-    size = kv_size;
-    used = 0;
-
      this->type_k = type_k;
      this->type_v = type_v;
  
-    cells.clear();
-    cells.resize(kv_size);
-
      // create a context for each buffer type
      std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
      auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
          auto it = ctx_map.find(buft);
          if (it == ctx_map.end()) {
              ggml_init_params params = {
-                /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+                /*.mem_size   =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
                  /*.mem_buffer =*/ NULL,
                  /*.no_alloc   =*/ true,
              };
@@ -75,37 +63,50 @@ llama_kv_cache_unified::llama_kv_cache_unified(
          return it->second;
      };
  
-    k_l.reserve(n_layer);
-    v_l.reserve(n_layer);
+    head = 0;
+    size = kv_size;
+    used = 0;
  
-    for (int i = 0; i < n_layer; i++) {
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+    cells.resize(kv_size);
+
+    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        if (filter && !filter(il)) {
+            LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+            continue;
+        }
+
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  
          const char * dev_name = "CPU";
  
          ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
  
          if (offload) {
-            auto * dev = model.dev_layer(i);
+            auto * dev = model.dev_layer(il);
              buft = ggml_backend_dev_buffer_type(dev);
  
              dev_name = ggml_backend_dev_name(dev);
          }
  
-        LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name);
+        LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
  
          ggml_context * ctx = ctx_for_buft(buft);
          if (!ctx) {
              throw std::runtime_error("failed to create ggml context for kv cache");
          }
  
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
-        ggml_format_name(k, "cache_k_l%d", i);
-        ggml_format_name(v, "cache_v_l%d", i);
-        k_l.push_back(k);
-        v_l.push_back(v);
+        ggml_tensor * k;
+        ggml_tensor * v;
+
+        k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
+        v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
+
+        ggml_format_name(k, "cache_k_l%d", il);
+        ggml_format_name(v, "cache_v_l%d", il);
+
+        map_layer_ids[il] = layers.size();
+        layers.push_back({ il, k, v });
      }
  
      // allocate tensors and initialize the buffers to avoid NaNs in the padding
@@ -117,8 +118,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
          if (!buf) {
              throw std::runtime_error("failed to allocate buffer for kv cache");
          }
-        ggml_backend_buffer_clear(buf, 0);
+
          LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+        ggml_backend_buffer_clear(buf, 0);
          bufs.emplace_back(buf);
      }
  
@@ -126,18 +129,19 @@ llama_kv_cache_unified::llama_kv_cache_unified(
          const size_t memory_size_k = size_k_bytes();
          const size_t memory_size_v = size_v_bytes();
  
-        LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+        LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(),
                  ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
      }
  }
  
  void llama_kv_cache_unified::clear() {
-    for (int32_t i = 0; i < (int32_t) size; ++i) {
+    for (uint32_t i = 0; i < size; ++i) {
          cells[i].pos = -1;
          cells[i].seq_id.clear();
      }
+
      head = 0;
      used = 0;
  
@@ -166,6 +170,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
              } else {
                  continue;
              }
+
              if (cells[i].is_empty()) {
                  // keep count of the number of used cells
                  if (cells[i].pos >= 0) {
@@ -262,6 +267,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
      for (uint32_t i = 0; i < size; ++i) {
          if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
              has_shift = true;
+
              cells[i].pos   += delta;
              cells[i].delta += delta;
  
@@ -314,53 +320,60 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
      }
  }
  
-llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
-    llama_pos result = 0;
+llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+    llama_pos result = std::numeric_limits<llama_pos>::max();
  
      for (uint32_t i = 0; i < size; ++i) {
          if (cells[i].has_seq_id(seq_id)) {
-            result = std::max(result, cells[i].pos);
+            result = std::min(result, cells[i].pos);
          }
      }
  
+    if (result == std::numeric_limits<llama_pos>::max()) {
+        result = -1;
+    }
+
      return result;
  }
  
-void llama_kv_cache_unified::restore() {
-    if (pending.ranges.empty()) {
-        return;
-    }
+llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+    llama_pos result = -1;
  
-    uint32_t new_head = size;
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id)) {
+            result = std::max(result, cells[i].pos);
+        }
+    }
  
-    for (auto & range : pending.ranges) {
-        for (uint32_t i = range.c0; i < range.c1; ++i) {
-            cells[i].seq_id.clear();
+    return result;
+}
  
-            // keep count of the number of used cells
-            if (cells[i].pos >= 0) {
-                used--;
-            }
+void llama_kv_cache_unified::restore() {
+    for (const auto & [id, cell] : recovery.cells) {
+        // TODO: move to new `struct kv_cells`
+        const bool is_empty0 = cells[id].is_empty();
+        const bool is_empty1 = cell.is_empty();
  
-            cells[i].pos = -1;
+        if (!is_empty0 && is_empty1) {
+            used--;
+        } else if (is_empty0 && !is_empty1) {
+            used++;
          }
  
-        new_head = std::min(new_head, range.c0);
+        cells[id] = cell;
      }
  
-    if (new_head != size && new_head < head) {
-        head = new_head;
-    }
+    recovery.clear();
  }
  
  void llama_kv_cache_unified::commit() {
-    if (pending.ranges.empty()) {
-        LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
-                __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
+    if (recovery.cells.empty()) {
+        LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
          return;
      }
  
-    pending.ranges.clear();
+    recovery.clear();
  }
  
  bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -450,25 +463,17 @@ void llama_kv_cache_unified::set_full() {
      head = 0;
  }
  
-llama_sbatch llama_kv_cache_unified::sbatch_init(
-        const llama_batch & batch,
-        bool logits_all) {
+llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
      return llama_sbatch(batch, hparams.n_embd, true, logits_all);
  }
  
-llama_ubatch llama_kv_cache_unified::ubatch_next(
-        llama_sbatch & sbatch,
-        uint32_t n_ubatch,
-        bool embd_pooled) const {
+llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
      GGML_UNUSED(embd_pooled);
      return sbatch.split_simple(n_ubatch);
  }
  
-bool llama_kv_cache_unified::find_slot(
-       const llama_ubatch & ubatch) {
+bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
      const uint32_t n_tokens = ubatch.n_tokens;
-    const uint32_t n_seqs   = ubatch.n_seqs;
-    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
  
      // if we have enough unused cells before the current head ->
      //   better to start searching from the beginning of the cache, hoping to fill it
@@ -483,6 +488,29 @@ bool llama_kv_cache_unified::find_slot(
          return false;
      }
  
+//#define FIND_SLOT_DEBUG 1
+#if FIND_SLOT_DEBUG
+    LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
+
+    // for debugging
+    {
+        std::string ss;
+        if (n_swa > 0) {
+            for (uint32_t i = 0; i < size; ++i) {
+                if (cells[i].pos == -1) {
+                    ss += '.';
+                } else {
+                    ss += std::to_string(*cells[i].seq_id.begin());
+                }
+                if (i%256 == 255) {
+                    ss += '\n';
+                }
+            }
+        }
+        LLAMA_LOG_WARN("\n%s\n", ss.c_str());
+    }
+#endif
+
      uint32_t n_tested = 0;
  
      while (true) {
@@ -512,27 +540,29 @@ bool llama_kv_cache_unified::find_slot(
          }
      }
  
-    for (uint32_t s = 0; s < n_seqs; s++) {
-        for (uint32_t i = 0; i < n_seq_tokens; ++i) {
-            uint32_t k = s*n_seq_tokens + i;
-            cells[head + k].pos = ubatch.pos[k];
+    for (uint32_t i = 0; i < n_tokens; ++i) {
+        // remember the original state
+        if (recovery.cells.find(head + i) == recovery.cells.end()) {
+            recovery.cells[head + i] = cells[head + i];
+        }
+
+        cells[head + i].pos = ubatch.pos[i];
  
-            for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
-                cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
-            }
+        for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
+            cells[head + i].seq_id.insert(ubatch.seq_id[i][j]);
          }
      }
  
      used += n_tokens;
  
-    pending.ranges.push_back({head, head + n_tokens});
-
      // a heuristic, to avoid attending the full cache if it is not yet utilized
      // after enough generations, the benefit from this heuristic disappears
      // if we start defragmenting the cache, the benefit from this will be more important
      n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
  
-    //printf("n = %5d, used = %5d, head = %5d\n", n, used, head);
+#ifdef FIND_SLOT_DEBUG
+    LLAMA_LOG_WARN("end:   n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
+#endif
  
      return true;
  }
@@ -552,11 +582,229 @@ int32_t llama_kv_cache_unified::get_used_cells() const {
  }
  
  bool llama_kv_cache_unified::get_can_shift() const {
-    return can_shift;
+    return true;
+}
+
+uint32_t llama_kv_cache_unified::get_n() const { // number of leading cells that get_k()/get_v() and the mask builders cover
+    return n; // recomputed by find_slot(): cell_max() padded to `padding`, kept within [padding, size]
+}
+
+uint32_t llama_kv_cache_unified::get_size() const { // total number of cells in the cache
+    return size; // assigned from kv_size in the constructor
+}
+
+ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const {
+    const int32_t ikv = map_layer_ids.at(il); // model layer index -> slot in `layers` (layers skipped by the constructor's filter have no slot)
+    // returns a view (created in ctx) of this layer's K cache restricted to the first n cells
+    auto * k = layers[ikv].k;
+    // view dims: [n_embd_head_k, n_head_kv(il), n], starting at byte offset 0
+    return ggml_view_3d(ctx, k,
+            hparams.n_embd_head_k, hparams.n_head_kv(il), n,
+            ggml_row_size(k->type, hparams.n_embd_head_k),    // nb1: byte stride between heads
+            ggml_row_size(k->type, hparams.n_embd_k_gqa(il)), // nb2: byte stride between cells
+            0);
+}
+
+ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il) const {
+    const int32_t ikv = map_layer_ids.at(il); // model layer index -> slot in `layers`
+    // returns a view (created in ctx) of this layer's V cache over the first n cells; the layout depends on v_trans
+    auto * v = layers[ikv].v;
+    // non-transposed cache: view dims are [n_embd_head_v, n_head_kv(il), n]
+    if (!v_trans) {
+        // note: v->nb[1] <= v->nb[2]
+        return ggml_view_3d(ctx, v,
+                hparams.n_embd_head_v, hparams.n_head_kv(il), n,
+                ggml_row_size(v->type, hparams.n_embd_head_v),    // v->nb[1]
+                ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
+                0);
+    }
+    // transposed cache: the cell index is the first dimension -> [n, n_head_kv(il), n_embd_head_v]; strides are built from v->ne[1]
+    // note: v->nb[1] > v->nb[2]
+    return ggml_view_3d(ctx, v,
+            n, hparams.n_head_kv(il), hparams.n_embd_head_v,
+            ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
+            ggml_row_size(v->type, v->ne[1]),                       // v->nb[2]
+            0);
+}
+
+ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
+    const int32_t ikv = map_layer_ids.at(il); // model layer index -> slot in `layers`
+    // builds the op that stores k_cur into this layer's K cache, starting at cell `head`
+    auto * k = layers[ikv].k;
+
+    const int64_t n_tokens = k_cur->ne[2]; // token count is read from dim 2 of k_cur
+    // flat destination view: n_tokens rows of n_embd_k_gqa(il) elements, offset by `head` rows
+    ggml_tensor * k_view = ggml_view_1d(ctx, k,
+            n_tokens*hparams.n_embd_k_gqa(il),
+            ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head);
+    // returns the ggml_cpy node (source k_cur, destination the cache view)
+    return ggml_cpy(ctx, k_cur, k_view);
+}
+
+ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
+    const int32_t ikv = map_layer_ids.at(il); // model layer index -> slot in `layers`
+    // builds the op that stores v_cur into this layer's V cache, starting at cell `head`
+    auto * v = layers[ikv].v;
+
+    const int64_t n_tokens = v_cur->ne[2]; // token count is read from dim 2 of v_cur
+    // reshape so that each token becomes one row of n_embd_v_gqa(il) values
+    v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
+
+    ggml_tensor * v_view = nullptr;
+
+    if (!v_trans) { // non-transposed cache: flat run of n_tokens rows, offset by `head` rows
+        v_view = ggml_view_1d(ctx, v,
+                n_tokens*hparams.n_embd_v_gqa(il),
+                ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head);
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
+                (v->ne[1])*ggml_element_size(v),  // row stride: v->ne[1] elements
+                (    head)*ggml_element_size(v)); // offset: `head` elements into the first row
+        // transpose the source so that its dims match the [n_tokens, n_embd_v_gqa(il)] destination view
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
+    // returns the ggml_cpy node (source v_cur, destination the cache view)
+    return ggml_cpy(ctx, v_cur, v_view);
+}
+
+void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) {
+    // no pruning is needed when the cache does not use SWA
+    GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache");
+    // removes seq_id (every sequence when seq_id < 0) from the cells that is_masked_swa() hides from a query at pmax
+    int n_attended = 0; // cells still visible to a query at pmin; only feeds the warning at the end
+    // NOTE(review): n_attended is counted over all cells, without checking seq_id
+    for (uint32_t i = 0; i < size; ++i) {
+        const llama_pos p0 = cells[i].pos;
+
+        if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
+            n_attended++;
+        }
+
+        if (is_masked_swa(p0, pmax)) {
+            if (seq_id < 0) {
+                cells[i].seq_id.clear();
+            } else if (cells[i].has_seq_id(seq_id)) {
+                cells[i].seq_id.erase(seq_id);
+            } else {
+                continue; // masked, but the cell does not hold seq_id -> leave it untouched
+            }
+
+            if (cells[i].is_empty()) {
+                // keep count of the number of used cells
+                if (cells[i].pos >= 0) {
+                    used--;
+                }
+
+                cells[i].pos = -1; // same position value that clear() assigns to every cell
+            }
+        }
+    }
+    // fewer visible cells than min(n_swa, pmin) is reported as a possibly incomplete window
+    if (n_attended < std::min<int>(n_swa, pmin)) {
+        LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa);
+    }
+}
+
+void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+    const int64_t n_tokens     = ubatch->n_tokens;
+    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+    const int64_t n_seqs       = ubatch->n_seqs;
+    // fills dst (host buffer, written as float) with one row of n_kv mask values per ubatch token
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    float * data = (float *) dst->data;
+    // only the first n cells of the cache are considered
+    const int64_t n_kv = n;
+
+    // Use only the previous KV cells of the correct sequence for each token of the ubatch.
+    // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+    // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
+    //   Causal mask:
+    //      xxx-------
+    //      xxxx------
+    //      xxxxx-----
+    //   Non-causal mask:
+    //      xxxxx-----
+    //      xxxxx-----
+    //      xxxxx-----
+    // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
+    for (int h = 0; h < 1; ++h) { // single iteration: one mask is written, h is always 0
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0]; // only the first sequence id of the entry is used
+
+            for (int j = 0; j < n_seq_tokens; ++j) {
+                const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
+                // p1 = position of the query token, p0 = position stored in cache cell i
+                for (int i = 0; i < n_kv; ++i) {
+                    const llama_pos p0 = cells[i].pos;
+
+                    bool masked = false;
+
+                    // mask the token if not the same sequence
+                    masked = masked || (!cells[i].has_seq_id(seq_id));
+
+                    // mask future tokens
+                    masked = masked || (causal_attn && p0 > p1);
+
+                    // apply SWA if any
+                    masked = masked || (is_masked_swa(p0, p1)); // also masks cells with a negative position
+                    // mask value: -INFINITY when masked, -|p0 - p1| when ALiBi is enabled, otherwise 0
+                    float f = 0.0f;
+
+                    if (masked) {
+                        f = -INFINITY;
+                    } else if (hparams.use_alibi) {
+                        f = -std::abs(p0 - p1);
+                    }
+                    // row = s*n_seq_tokens + j (index of the token in the ubatch), column = i (cache cell)
+                    data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
+                }
+            }
+        }
+        // NOTE(review): data has already been written through above, so the null check below cannot protect those writes
+        // mask padded tokens
+        if (data) {
+            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { // rows past n_tokens up to the padded row count
+                for (int j = 0; j < n_kv; ++j) {
+                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                }
+            }
+        }
+    }
+}
+
+void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    // copies every cell's `delta` (the shift accumulated by seq_add()) into dst as int32, one entry per cell
+    int32_t * data = (int32_t *) dst->data;
+    // NOTE(review): writes `size` entries; build_graph_shift() sizes the tensor with cparams.n_ctx - confirm that is >= size
+    for (uint32_t i = 0; i < size; ++i) {
+        data[i] = cells[i].delta;
+    }
+}
+
+void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+    const int64_t n_tokens = ubatch->n_tokens;
+    // fills dst (host buffer, written as int32) with one relative-position bucket per (token, cell) pair
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+
+    int32_t * data = (int32_t *) dst->data;
+    // only the first n cells of the cache are covered
+    const int64_t n_kv = n;
+    // row j = ubatch token, column i = cache cell; the last argument (bidirectional) is always false here
+    for (int h = 0; h < 1; ++h) { // single iteration: h is always 0
+        for (int j = 0; j < n_tokens; ++j) {
+            for (int i = 0; i < n_kv; ++i) {
+                data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
+            }
+        }
+    }
  }
  
  llama_pos llama_kv_cache_unified::get_pos_max() const {
      llama_pos pos_max = -1;
+
      for (const auto & cell : cells) {
          pos_max = std::max(pos_max, cell.pos);
      }
@@ -566,6 +814,7 @@ llama_pos llama_kv_cache_unified::get_pos_max() const {
  
  size_t llama_kv_cache_unified::total_size() const {
      size_t size = 0;
+
      for (const auto & buf : bufs) {
          size += ggml_backend_buffer_get_size(buf.get());
      }
@@ -576,8 +825,8 @@ size_t llama_kv_cache_unified::total_size() const {
  size_t llama_kv_cache_unified::size_k_bytes() const {
      size_t size_k_bytes = 0;
  
-    for (const auto & k : k_l) {
-        size_k_bytes += ggml_nbytes(k);
+    for (const auto & layer : layers) {
+        size_k_bytes += ggml_nbytes(layer.k);
      }
  
      return size_k_bytes;
@@ -586,8 +835,8 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
  size_t llama_kv_cache_unified::size_v_bytes() const {
      size_t size_v_bytes = 0;
  
-    for (const auto & v : v_l) {
-        size_v_bytes += ggml_nbytes(v);
+    for (const auto & layer : layers) {
+        size_v_bytes += ggml_nbytes(layer.v);
      }
  
      return size_v_bytes;
@@ -651,13 +900,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
      GGML_UNUSED(ubatch);
  
      if (k_shift) {
-        assert(ggml_backend_buffer_is_host(k_shift->buffer));
-
-        int32_t * data = (int32_t *) k_shift->data;
-
-        for (uint32_t i = 0; i < kv_self->size; ++i) {
-            data[i] = kv_self->cells[i].delta;
-        }
+        kv_self->set_input_k_shift(k_shift);
      }
  }
  
@@ -667,13 +910,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
                  ggml_cgraph * gf) const {
      auto res = std::make_unique<llm_graph_result>();
  
-    const auto & n_layer = hparams.n_layer;
-
      const auto & n_embd_head_k = hparams.n_embd_head_k;
    //const auto & n_embd_head_v = hparams.n_embd_head_v;
  
-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
-
      //GGML_ASSERT(kv_self->size == n_ctx);
  
      auto inp = std::make_unique<llm_graph_input_k_shift>(this);
@@ -681,24 +920,22 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
      inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
      ggml_set_input(inp->k_shift);
  
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (const auto & layer : layers) {
+        const uint32_t il = layer.il;
+
          const int64_t n_head_kv    = hparams.n_head_kv(il);
          const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  
-        const bool is_swa = hparams.is_swa(il);
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  
-        // note: the swa rope params could become part of the cparams in the future
-        //       if we decide to make them configurable, like the non-sliding ones
-        const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
-        const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
-
-        ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
          ggml_tensor * k =
-            ggml_view_3d(ctx, k_l[il],
+            ggml_view_3d(ctx, layer.k,
                  n_embd_head_k, n_head_kv, size,
-                ggml_row_size(k_l[il]->type, n_embd_head_k),
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+                ggml_row_size(layer.k->type, n_embd_head_k),
+                ggml_row_size(layer.k->type, n_embd_k_gqa),
                  0);
  
          ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
@@ -803,44 +1040,46 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
              nm++;
          }
  
-        for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
+        for (const auto & layer : layers) {
+            const uint32_t il = layer.il;
+
              const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
              const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  
-            ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
+            ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
                      n_embd_k_gqa, nm,
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+                    ggml_row_size(layer.k->type, n_embd_k_gqa),
+                    ggml_row_size(layer.k->type, n_embd_k_gqa*i));
  
-            ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
+            ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
                      n_embd_k_gqa, nm,
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+                    ggml_row_size(layer.k->type, n_embd_k_gqa),
+                    ggml_row_size(layer.k->type, n_embd_k_gqa*id));
  
              ggml_tensor * view_v_src;
              ggml_tensor * view_v_dst;
  
              if (cparams.flash_attn) {
                  // NOTE: the V cache is not transposed when using flash attention
-                view_v_src = ggml_view_2d(ctx, v_l[il],
+                view_v_src = ggml_view_2d(ctx, layer.v,
                          n_embd_v_gqa, nm,
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+                        ggml_row_size(layer.v->type, n_embd_v_gqa),
+                        ggml_row_size(layer.v->type, n_embd_v_gqa*i));
  
-                view_v_dst = ggml_view_2d(ctx, v_l[il],
+                view_v_dst = ggml_view_2d(ctx, layer.v,
                          n_embd_v_gqa, nm,
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+                        ggml_row_size(layer.v->type, n_embd_v_gqa),
+                        ggml_row_size(layer.v->type, n_embd_v_gqa*id));
              } else {
-                view_v_src = ggml_view_2d(ctx, v_l[il],
+                view_v_src = ggml_view_2d(ctx, layer.v,
                          nm, n_embd_v_gqa,
-                        ggml_row_size(v_l[il]->type, size),
-                        ggml_row_size(v_l[il]->type, i));
+                        ggml_row_size(layer.v->type, size),
+                        ggml_row_size(layer.v->type, i));
  
-                view_v_dst = ggml_view_2d(ctx, v_l[il],
+                view_v_dst = ggml_view_2d(ctx, layer.v,
                          nm, n_embd_v_gqa,
-                        ggml_row_size(v_l[il]->type, size),
-                        ggml_row_size(v_l[il]->type, id));
+                        ggml_row_size(layer.v->type, size),
+                        ggml_row_size(layer.v->type, id));
              }
  
              ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
@@ -857,7 +1096,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
  }
  
  bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
-    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_layer = layers.size();
  
      const uint32_t n_kv   = cell_max();
      const uint32_t n_used = used;
@@ -1005,6 +1244,34 @@ uint32_t llama_kv_cache_unified::cell_max() const {
      return 0;
  }
  
+bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+    if (p0 < 0) { // negative position (e.g. the -1 that clear() assigns): always masked, whatever the SWA type
+        return true;
+    }
+    // returns true when a cell at position p0 must be hidden from a query at position p1
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            { // no window rule
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+            {
+                if (p1 - p0 >= (int32_t) n_swa) { // p0 is n_swa or more positions behind p1
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa; // first position of the n_swa-sized chunk that contains p1
+                // NOTE(review): divides by n_swa, so n_swa must be > 0 here; n_swa looks unsigned (see the cast above), which would make this arithmetic unsigned - confirm
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+    }
+    // not hidden by the window rule; p0 > p1 is not rejected here (set_input_kq_mask() handles causality itself)
+    return false;
+}
+
  void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
      std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
      uint32_t cell_count = 0;
@@ -1082,7 +1349,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::
  
  void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
      const uint32_t v_trans = this->v_trans ? 1 : 0;
-    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_layer = layers.size();
  
      io.write(&v_trans, sizeof(v_trans));
      io.write(&n_layer, sizeof(n_layer));
@@ -1091,56 +1358,63 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
  
      // Iterate and write all the keys first, each row is a cell
      // Get whole range at a time
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (const auto & layer : layers) {
+        const uint32_t il = layer.il;
+
          const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
  
          // Write key type
-        const int32_t k_type_i = (int32_t)k_l[il]->type;
+        const int32_t k_type_i = (int32_t)layer.k->type;
          io.write(&k_type_i, sizeof(k_type_i));
  
          // Write row size of key
-        const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
          io.write(&k_size_row, sizeof(k_size_row));
  
          // Read each range of cells of k_size length each into tmp_buf and write out
          for (const auto & range : cell_ranges) {
              const size_t range_size = range.second - range.first;
              const size_t buf_size = range_size * k_size_row;
-            io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
+            io.write_tensor(layer.k, range.first * k_size_row, buf_size);
          }
      }
  
      if (!v_trans) {
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (const auto & layer : layers) {
+            const uint32_t il = layer.il;
+
              const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  
              // Write value type
-            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            const int32_t v_type_i = (int32_t)layer.v->type;
              io.write(&v_type_i, sizeof(v_type_i));
  
              // Write row size of value
-            const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+            const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
              io.write(&v_size_row, sizeof(v_size_row));
  
              // Read each range of cells of v_size length each into tmp_buf and write out
              for (const auto & range : cell_ranges) {
                  const size_t range_size = range.second - range.first;
                  const size_t buf_size = range_size * v_size_row;
-                io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
+                io.write_tensor(layer.v, range.first * v_size_row, buf_size);
              }
          }
      } else {
          // When v is transposed, we also need the element size and get the element ranges from each row
          const uint32_t kv_size = size;
-        for (uint32_t il = 0; il < n_layer; ++il) {
+
+        for (const auto & layer : layers) {
+            const uint32_t il = layer.il;
+
              const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  
              // Write value type
-            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            const int32_t v_type_i = (int32_t)layer.v->type;
              io.write(&v_type_i, sizeof(v_type_i));
  
              // Write element size
-            const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
+            const uint32_t v_size_el = ggml_type_size(layer.v->type);
              io.write(&v_size_el, sizeof(v_size_el));
  
              // Write GQA embedding size
@@ -1153,7 +1427,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
                      const size_t range_size = range.second - range.first;
                      const size_t src_offset = (range.first + j * kv_size) * v_size_el;
                      const size_t buf_size = range_size * v_size_el;
-                    io.write_tensor(v_l[il], src_offset, buf_size);
+                    io.write_tensor(layer.v, src_offset, buf_size);
                  }
              }
          }
@@ -1170,8 +1444,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
          llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
  
          batch.n_tokens = cell_count;
-        batch.n_seq_tokens = cell_count;
-        batch.n_seqs = 1;
  
          for (uint32_t i = 0; i < cell_count; ++i) {
              llama_pos pos;
@@ -1186,13 +1458,15 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
              }
  
              batch.pos[i] = pos;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id[i] = &dest_seq_id;
          }
-        batch.n_seq_id[0] = 1;
-        batch.seq_id[0] = &dest_seq_id;
+
          if (!find_slot(batch)) {
              LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
              return false;
          }
+
          commit();
  
          // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
@@ -1249,11 +1523,12 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
      uint32_t v_trans;
      uint32_t n_layer;
+
      io.read_to(&v_trans, sizeof(v_trans));
      io.read_to(&n_layer, sizeof(n_layer));
  
-    if (n_layer != hparams.n_layer) {
-        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+    if (n_layer != layers.size()) {
+        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
          return false;
      }
      if (cell_count > size) {
@@ -1266,13 +1541,15 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
      }
  
      // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (const auto & layer : layers) {
+        const uint32_t il = layer.il;
+
          const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
  
          // Read type of key
          int32_t k_type_i_ref;
          io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
-        const int32_t k_type_i = (int32_t) k_l[il]->type;
+        const int32_t k_type_i = (int32_t) layer.k->type;
          if (k_type_i != k_type_i_ref) {
              LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
              return false;
@@ -1281,7 +1558,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
          // Read row size of key
          uint64_t k_size_row_ref;
          io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
-        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
          if (k_size_row != k_size_row_ref) {
              LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
              return false;
@@ -1289,18 +1566,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  
          if (cell_count) {
              // Read and set the keys for the whole cell range
-            ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+            ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
          }
      }
  
      if (!this->v_trans) {
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (const auto & layer : layers) {
+            const uint32_t il = layer.il;
+
              const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  
              // Read type of value
              int32_t v_type_i_ref;
              io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
-            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            const int32_t v_type_i = (int32_t)layer.v->type;
              if (v_type_i != v_type_i_ref) {
                  LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                  return false;
@@ -1309,7 +1588,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
              // Read row size of value
              uint64_t v_size_row_ref;
              io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
-            const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+            const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
              if (v_size_row != v_size_row_ref) {
                  LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
                  return false;
@@ -1317,18 +1596,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  
              if (cell_count) {
                  // Read and set the values for the whole cell range
-                ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+                ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
              }
          }
      } else {
          // For each layer, read the values for each cell (transposed)
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (const auto & layer : layers) {
+            const uint32_t il = layer.il;
+
              const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  
              // Read type of value
              int32_t v_type_i_ref;
              io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
-            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            const int32_t v_type_i = (int32_t)layer.v->type;
              if (v_type_i != v_type_i_ref) {
                  LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                  return false;
@@ -1337,7 +1618,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
              // Read element size of value
              uint32_t v_size_el_ref;
              io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
-            const size_t v_size_el = ggml_type_size(v_l[il]->type);
+            const size_t v_size_el = ggml_type_size(layer.v->type);
              if (v_size_el != v_size_el_ref) {
                  LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
                  return false;
@@ -1355,7 +1636,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
                  // For each row in the transposed matrix, read the values for the whole cell range
                  for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                      const size_t dst_offset = (head + j * size) * v_size_el;
-                    ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                    ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                  }
              }
          }
@@ -1364,6 +1645,205 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
      return true;
  }
  
+//
+// llama_kv_cache_unified_iswa
+//
+
+// Build the two underlying unified caches: one holding the non-SWA layers of the
+// model and one holding the SWA layers.
+llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
+        const llama_model & model,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                     bool   offload,
+                 uint32_t   kv_size,
+                     bool   swa_full,
+                 uint32_t   n_seq_max,
+                 uint32_t   n_batch,
+                 uint32_t   padding) : hparams(model.hparams) {
+    // the non-SWA cache always gets the requested number of cells
+    const uint32_t n_cells_base = kv_size;
+
+    // the SWA cache is sized from n_swa, n_seq_max and n_batch (padded) and never exceeds the base size
+    uint32_t n_cells_swa = std::min(n_cells_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+
+    if (swa_full) {
+        // full-size mode: the SWA cache becomes as large as the base cache and pruning is turned off
+        LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+        n_cells_swa = n_cells_base;
+        do_prune    = false;
+    }
+
+    // each unified cache only includes the layers accepted by its filter
+    llama_kv_cache_unified::layer_filter_cb keep_non_swa = [&](int32_t il) { return !model.hparams.is_swa(il); };
+    llama_kv_cache_unified::layer_filter_cb keep_swa     = [&](int32_t il) { return  model.hparams.is_swa(il); };
+
+    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, n_cells_base);
+
+    kv_base = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(keep_non_swa), type_k, type_v,
+            v_trans, offload, n_cells_base, padding,
+            0, LLAMA_SWA_TYPE_NONE);
+
+    LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, n_cells_swa);
+
+    kv_swa = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(keep_swa), type_k, type_v,
+            v_trans, offload, n_cells_swa, padding,
+            hparams.n_swa, hparams.swa_type);
+}
+
+// Forward clear() to both underlying caches (base first, then SWA).
+void llama_kv_cache_unified_iswa::clear() {
+    kv_base->clear();
+    kv_swa ->clear();
+}
+
+// Forward seq_rm() to both underlying caches.
+// Returns true only if both caches report success.
+bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    // both calls are always made - the results are combined afterwards without short-circuiting
+    const bool ok_base = kv_base->seq_rm(seq_id, p0, p1);
+    const bool ok_swa  = kv_swa ->seq_rm(seq_id, p0, p1);
+
+    return ok_base & ok_swa;
+}
+
+// Forward seq_cp() with the same arguments to both underlying caches.
+void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+// Forward seq_keep() with the same sequence id to both underlying caches.
+void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
+    kv_base->seq_keep(seq_id);
+    kv_swa ->seq_keep(seq_id);
+}
+
+// Forward seq_add() with the same arguments to both underlying caches.
+void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    kv_base->seq_add(seq_id, p0, p1, delta);
+    kv_swa ->seq_add(seq_id, p0, p1, delta);
+}
+
+// Forward seq_div() with the same arguments to both underlying caches.
+void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    kv_base->seq_div(seq_id, p0, p1, d);
+    kv_swa ->seq_div(seq_id, p0, p1, d);
+}
+
+// Minimum position of the given sequence, as reported by the SWA cache.
+llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
+    // the base cache is a superset of the SWA cache, so we can just check the SWA cache
+    return kv_swa->seq_pos_min(seq_id);
+}
+
+// Maximum position of the given sequence, as reported by the SWA cache.
+llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
+    return kv_swa->seq_pos_max(seq_id);
+}
+
+// Forward restore() to both underlying caches.
+void llama_kv_cache_unified_iswa::restore() {
+    kv_base->restore();
+    kv_swa ->restore();
+}
+
+// Commit both underlying caches, then - unless pruning is disabled - let the SWA cache
+// prune every sequence recorded in `pending`, and finally reset `pending`.
+void llama_kv_cache_unified_iswa::commit() {
+    kv_base->commit();
+    kv_swa ->commit();
+
+    if (do_prune) {
+        // slide the attention window: forget old tokens that are now outside of it
+        for (const auto & it : pending.pos) {
+            // it.first is the sequence id, it.second holds the [pmin, pmax] range seen for it
+            kv_swa->prune_swa(it.first, it.second.pmin, it.second.pmax);
+        }
+    }
+
+    pending.clear();
+}
+
+// Forward update() to both underlying caches.
+// Returns true only if both calls return true.
+bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
+    // both caches are always updated - the results are combined without short-circuiting
+    const bool res_base = kv_base->update(lctx);
+    const bool res_swa  = kv_swa ->update(lctx);
+
+    return res_base & res_swa;
+}
+
+// Forward defrag_sched() with the same threshold to both underlying caches.
+void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
+    kv_base->defrag_sched(thold);
+    kv_swa ->defrag_sched(thold);
+}
+
+// Forward set_full() to both underlying caches.
+void llama_kv_cache_unified_iswa::set_full() {
+    kv_base->set_full();
+    kv_swa ->set_full();
+}
+
+// Start processing a new batch.
+// When pruning is enabled, first record in `pending` the smallest and largest position
+// that the batch contains for every sequence id; commit() hands these ranges to prune_swa().
+llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
+    pending.clear();
+
+    if (do_prune) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                const llama_seq_id seq_id = batch.seq_id[i][s];
+                const llama_pos    pos    = batch.pos[i];
+
+                const auto it = pending.pos.find(seq_id);
+
+                if (it == pending.pos.end()) {
+                    // first token seen for this sequence: the range starts as [pos, pos]
+                    auto & range = pending.pos[seq_id];
+
+                    range.pmin = pos;
+                    range.pmax = pos;
+                } else {
+                    // widen the existing range so that it covers pos
+                    it->second.pmin = std::min(it->second.pmin, pos);
+                    it->second.pmax = std::max(it->second.pmax, pos);
+                }
+            }
+        }
+    }
+
+    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+}
+
+// Produce the next ubatch via sbatch.split_simple(n_ubatch); embd_pooled is not used.
+llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
+    GGML_UNUSED(embd_pooled);
+    return sbatch.split_simple(n_ubatch);
+}
+
+// Forward find_slot() for the same ubatch to both underlying caches.
+// Returns true only if both caches return true.
+bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
+    // the SWA cache is queried even when the base cache fails - no short-circuiting
+    const bool found_base = kv_base->find_slot(batch);
+    const bool found_swa  = kv_swa ->find_slot(batch);
+
+    return found_base & found_swa;
+}
+
+// Token count as reported by the base (non-SWA) cache; the SWA cache is not consulted.
+int32_t llama_kv_cache_unified_iswa::get_n_tokens()   const {
+    return kv_base->get_n_tokens();
+}
+
+// Used-cell count as reported by the base (non-SWA) cache; the SWA cache is not consulted.
+int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
+    return kv_base->get_used_cells();
+}
+
+// Maximum position as reported by the base (non-SWA) cache; the SWA cache is not consulted.
+llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
+    return kv_base->get_pos_max();
+}
+
+// Shifting is reported as possible only when both caches have the same size.
+// NOTE(review): presumably equal sizes identify the full-size SWA configuration - confirm.
+bool llama_kv_cache_unified_iswa::get_can_shift() const {
+    return kv_base->get_size() == kv_swa->get_size();
+}
+
+// Write the state of the base cache and then of the SWA cache to the same writer, in that order.
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    kv_base->state_write(io, seq_id);
+    kv_swa ->state_write(io, seq_id);
+}
+
+// Read the state of the base cache and then of the SWA cache from the same reader,
+// i.e. in the same order in which state_write() emits them.
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    kv_base->state_read(io, seq_id);
+    kv_swa ->state_read(io, seq_id);
+}
+
+// Non-owning pointer to the base (non-SWA) cache; ownership stays with this object.
+llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
+    return kv_base.get();
+}
+
+// Non-owning pointer to the SWA cache; ownership stays with this object.
+llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
+    return kv_swa.get();
+}
+
  //
  // llama_kv_cache_recurrent
  //
@@ -1683,8 +2163,24 @@ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_
      }
  }
  
+// Smallest position among the cells that contain seq_id.
+// Returns -1 when the scan over all cells finds no such position.
+llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
+    // sentinel meaning "nothing found yet"
+    constexpr llama_pos pos_none = std::numeric_limits<llama_pos>::max();
+
+    llama_pos pos_min = pos_none;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        const auto & cell = cells[i];
+
+        if (!cell.has_seq_id(seq_id)) {
+            continue;
+        }
+
+        if (cell.pos < pos_min) {
+            pos_min = cell.pos;
+        }
+    }
+
+    return pos_min == pos_none ? -1 : pos_min;
+}
+
  llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
-    llama_pos result = 0;
+    llama_pos result = -1;
  
      for (uint32_t i = 0; i < size; ++i) {
          if (cells[i].has_seq_id(seq_id)) {
@@ -2063,6 +2559,7 @@ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq
      io.read_to(&cell_count, sizeof(cell_count));
  
      bool res = true;
+
      res = res && state_read_meta(io, cell_count, seq_id);
      res = res && state_read_data(io, cell_count);
  
@@ -2422,73 +2919,7 @@ void llama_kv_cache_view_free(llama_kv_cache_view * view) {
      }
  }
  
-void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
-    // TODO: rework this in the future, for now quick hack
-    const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
-    if (kvu == nullptr) {
-        LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
-        return;
-    }
-
-    if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
-        view->n_cells = int32_t(kvu->size);
-        void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
-        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
-        view->cells = (llama_kv_cache_view_cell *)p;
-        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
-        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
-        view->cells_sequences = (llama_seq_id *)p;
-    }
-
-    const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
-    llama_kv_cache_view_cell * c_curr = view->cells;
-    llama_seq_id * cs_curr = view->cells_sequences;
-    int32_t used_cells = 0;
-    int32_t token_count = 0;
-    int32_t curr_contig_idx = -1;
-    uint32_t max_contig = 0;
-    int32_t max_contig_idx = -1;
-
-    for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
-        const size_t curr_size = kv_cells[i].seq_id.size();
-        token_count += curr_size;
-        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
-
-        if (curr_size > 0) {
-            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
-                max_contig = i - curr_contig_idx;
-                max_contig_idx = curr_contig_idx;
-            }
-            curr_contig_idx = -1;
-        } else if (curr_contig_idx < 0) {
-            curr_contig_idx = i;
-        }
-
-        int seq_idx = 0;
-        for (const llama_seq_id it : kv_cells[i].seq_id) {
-            if (seq_idx >= view->n_seq_max) {
-                break;
-            }
-            cs_curr[seq_idx] = it;
-            seq_idx++;
-        }
-        if (seq_idx != 0) {
-            used_cells++;
-        }
-        for (; seq_idx < view->n_seq_max; seq_idx++) {
-            cs_curr[seq_idx] = -1;
-        }
-    }
-    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
-        max_contig_idx = curr_contig_idx;
-        max_contig = kv_cells.size() - curr_contig_idx;
-    }
-    view->max_contiguous = max_contig;
-    view->max_contiguous_idx = max_contig_idx;
-    view->token_count = token_count;
-    view->used_cells = used_cells;
-    if (uint32_t(used_cells) != kvu->used) {
-        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
-            __func__, kvu->used, used_cells);
-    }
+void llama_kv_cache_view_update(llama_kv_cache_view * , const llama_kv_cache * ) {
+    // TODO: will be removed soon, keep this for now to avoid too many changes in
+    //       https://github.com/ggml-org/llama.cpp/pull/13194
  }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h

index e83e12c09f2b1d545c53b45b5c1518ca1ad8001a..256a7d43ed57f04b4e4fa66e504fac69b3f97bc1 100644 (file)
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -8,6 +8,7 @@
  #include "ggml-cpp.h"
  
  #include <set>
+#include <unordered_map>
  #include <vector>
  
  struct llama_cparams;
@@ -40,6 +41,9 @@ struct llama_kv_cache : public llama_memory_i {
      // batch processing
      //
  
+    // =============================================================================================================
+    // TODO: refactor and simplify this
+
      virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
  
      // different KV caches require different batch splitting strategies
@@ -48,6 +52,8 @@ struct llama_kv_cache : public llama_memory_i {
      // find an empty slot of size "n_tokens" in the cache
      virtual bool find_slot(const llama_ubatch & batch) = 0;
  
+    // =============================================================================================================
+
      // getters
      virtual int32_t   get_n_tokens()   const = 0;
      virtual int32_t   get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
@@ -87,38 +93,24 @@ private:
  // llama_kv_cache_unified
  //
  
-// TODO: add notion of max sequences
  class llama_kv_cache_unified : public llama_kv_cache {
  public:
-    struct kv_cell {
-        llama_pos pos   = -1;
-        llama_pos delta =  0;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
-    };
-
      static uint32_t get_padding(const llama_cparams & cparams);
  
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
      llama_kv_cache_unified(
-            const llama_model & model,
-                    ggml_type   type_k,
-                    ggml_type   type_v,
-                         bool   v_trans,
-                         bool   offload,
-                     uint32_t   kv_size,
-                     uint32_t   padding);
+            const llama_model &  model,
+              layer_filter_cb && filter,
+                    ggml_type    type_k,
+                    ggml_type    type_v,
+                         bool    v_trans,
+                         bool    offload,
+                     uint32_t    kv_size,
+                     uint32_t    padding,
+                     uint32_t    n_swa,
+               llama_swa_type    swa_type);
  
      ~llama_kv_cache_unified() = default;
  
@@ -130,10 +122,11 @@ public:
  
      bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
      void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
      void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
      void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
  
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
      llama_pos seq_pos_max(llama_seq_id seq_id) const override;
  
      //
@@ -150,7 +143,6 @@ public:
      void set_full() override;
  
      llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
      llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
  
      // updates the cache head
@@ -169,29 +161,72 @@ public:
      // state write/load
  
      void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
  
-    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
-    uint32_t size = 0; // total number of cells, shared across all sequences
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+    //
+    // llama_kv_cache_unified specific API
+    //
  
-    // computed before each graph build
-    uint32_t n = 0;
+    uint32_t get_n() const;
+    uint32_t get_size() const;
  
-    std::vector<kv_cell> cells;
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
  
-    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<ggml_tensor *> v_l;
+    // store k_cur and v_cur in the cache based on the current head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
+
+    void prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax);
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+    void set_input_k_shift   (ggml_tensor * dst) const;
+    void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
  
  private:
      const llama_model & model;
      const llama_hparams & hparams;
  
+    struct kv_cell {
+        llama_pos pos   = -1;
+        llama_pos delta =  0;
+
+        // TODO: replace with bitset uint64_t
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    struct kv_layer {
+        // layer index in the model
+        // note: can be different from the layer index in the KV cache
+        uint32_t il;
+
+        ggml_tensor * k;
+        ggml_tensor * v;
+    };
+
      bool has_shift = false;
      bool do_defrag = false;
-
      bool v_trans   = true;  // the value tensor is transposed
-    bool can_shift = false;
+
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
+    uint32_t used = 0; // used cells (i.e. at least one seq_id) (TODO: add `struct kv_cells` and keep track automatically)
+
+    // computed before each graph build
+    uint32_t n = 0;
  
      // required padding
      uint32_t padding = 1;
@@ -199,9 +234,29 @@ private:
      ggml_type type_k = GGML_TYPE_F16;
      ggml_type type_v = GGML_TYPE_F16;
  
+    // SWA
+    uint32_t n_swa = 0;
+
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
      std::vector<ggml_context_ptr>        ctxs;
      std::vector<ggml_backend_buffer_ptr> bufs;
  
+    std::vector<kv_cell>  cells;  // TODO: replace with `struct kv_cells`
+    std::vector<kv_layer> layers;
+
+    // model layer id -> KV cache layer id
+    std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+    // recovery information used to restore the KV cells to their original state in case of a failure
+    struct {
+        void clear() {
+            cells.clear();
+        }
+
+        std::unordered_map<uint32_t, kv_cell> cells;
+    } recovery;
+
      // defrag
      struct {
          std::vector<uint32_t> ids;
@@ -210,17 +265,6 @@ private:
      // return true if cells have been moved
      bool defrag_prepare(int32_t n_max_nodes);
  
-    // commit/restore cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
-
      // find how many cells are currently in use
      uint32_t cell_max() const;
  
@@ -229,6 +273,8 @@ private:
      size_t size_k_bytes() const;
      size_t size_v_bytes() const;
  
+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
      ggml_tensor * build_rope_shift(
              const llama_cparams & cparams,
                     ggml_context * ctx,
@@ -255,6 +301,106 @@ private:
      bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
  };
  
+//
+// llama_kv_cache_unified_iswa
+//
+
+// utilizes two instances of llama_kv_cache_unified
+//   the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
+//   upon successful commit, the SWA cache removes old tokens outside the n_swa window
+
+class llama_kv_cache_unified_iswa : public llama_kv_cache {
+public:
+    llama_kv_cache_unified_iswa(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                     uint32_t   kv_size,
+                         bool   swa_full,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_batch,
+                     uint32_t   padding);
+
+    ~llama_kv_cache_unified_iswa() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    void clear() override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    //
+    // llama_kv_cache
+    //
+
+    void restore() override;
+    void commit()  override;
+
+    bool update(llama_context & ctx) override;
+
+    void defrag_sched(float thold) override;
+
+    void set_full() override;
+
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+    bool find_slot(const llama_ubatch & batch) override;
+
+    int32_t get_n_tokens()   const override;
+    int32_t get_used_cells() const override;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
+
+    bool get_can_shift() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+
+    //
+    // llama_kv_cache_unified_iswa specific API
+    //
+
+    llama_kv_cache_unified * get_kv_base() const;
+    llama_kv_cache_unified * get_kv_swa () const;
+
+private:
+    const llama_hparams & hparams;
+
+    bool do_prune = true;
+
+    struct {
+        struct entry {
+            llama_pos pmin;
+            llama_pos pmax;
+        };
+
+        void clear() {
+            pos.clear();
+        }
+
+        // used to perform SWA pruning of old tokens
+        std::unordered_map<llama_seq_id, entry> pos;
+    } pending;
+
+    std::unique_ptr<llama_kv_cache_unified> kv_base;
+    std::unique_ptr<llama_kv_cache_unified> kv_swa;
+};
+
  //
  // llama_kv_cache_recurrent
  //
@@ -302,6 +448,7 @@ public:
      void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
      void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
  
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
      llama_pos seq_pos_max(llama_seq_id seq_id) const override;
  
      //
@@ -318,7 +465,6 @@ public:
      void set_full() override;
  
      llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
      llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
  
      bool find_slot(const llama_ubatch & batch) override;
diff --git a/src/llama-memory.h b/src/llama-memory.h

index c7412d5911ed79153a5d8fefe7eb24917901630c..c2571edc715e1c37e51782b22eb6851a8ebe92e6 100644 (file)
--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@@ -7,8 +7,8 @@ struct llama_memory_params {
      ggml_type type_k;
      ggml_type type_v;
  
-    // parameters for other types of memory
-    // ...
+    // use full-size SWA cache
+    bool swa_full;
  };
  
  // general concept of LLM memory
@@ -25,6 +25,7 @@ public:
      virtual void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) = 0;
      virtual void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) = 0;
  
+    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
      virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
  
      virtual bool get_can_edit() const = 0;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index 7fd094b63f26921ee06bae90c3c440b717f1dbee..057f1fc1777fbccadaf5a8d4b07c02166a549c0d 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -571,9 +571,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
+
+                hparams.swa_type      = LLAMA_SWA_TYPE_CHUNKED;
+                hparams.n_swa         = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
                  hparams.n_swa_pattern = 4;    // pattern: 3 chunked - 1 full
-                hparams.n_attn_chunk  = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
-                hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
  
                  switch (hparams.n_expert) {
                      case 16:  type = LLM_TYPE_17B_16E; break;
@@ -855,20 +856,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                  // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
                  if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                      // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
                      hparams.n_swa = 2047;
                  } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                      // default value for Phi-3-mini-128k-instruct
-                    // note: this seems incorrect because the window is bigger than the train context?
-                    hparams.n_swa = 262144;
+                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa         = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
                  } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                      // default value for Phi-3-medium-128k-instruct
-                    // note: this seems incorrect because the window is equal to the train context?
-                    hparams.n_swa = 131072;
+                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa         = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
                  }
+
                  bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                  if (!found_swa && hparams.n_swa == 0) {
                      throw std::runtime_error("invalid value for sliding_window");
                  }
+
+                if (hparams.n_swa > hparams.n_ctx_train) {
+                    LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
+
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa         = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
+                }
              } break;
          case LLM_ARCH_PHIMOE:
              {
@@ -937,6 +960,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
              } break;
          case LLM_ARCH_GEMMA2:
              {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                  hparams.n_swa = 4096; // default value of gemma 2
                  hparams.n_swa_pattern = 2;
                  hparams.attn_soft_cap = true;
@@ -955,6 +979,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
              } break;
          case LLM_ARCH_GEMMA3:
              {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                  hparams.n_swa_pattern = 6;
  
                  hparams.rope_freq_base_train_swa  = 10000.0f;
@@ -1039,6 +1064,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
              } break;
          case LLM_ARCH_COHERE2:
              {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                  hparams.n_swa_pattern = 4;
  
                  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
@@ -4489,7 +4515,17 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
      return it->second;
  }
  
-ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+}
+
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+}
+
+ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
      // choose long/short freq factors based on the context size
      if (layers[il].rope_freqs != nullptr) {
          return layers[il].rope_freqs;
@@ -4517,21 +4553,174 @@ struct llm_build_llama : public llm_graph_context {
          // inp_pos - contains the positions
          ggml_tensor * inp_pos = build_inp_pos();
  
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(cur, "ffn_moe_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
          // temperature tuning
          ggml_tensor * inp_attn_scale = nullptr;
-        if (arch == LLM_ARCH_LLAMA4) {
-            inp_attn_scale = build_inp_attn_scale();
-        }
+        inp_attn_scale = build_inp_attn_scale();
  
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
  
          const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
          for (int il = 0; il < n_layer; ++il) {
              ggml_tensor * inpSA = inpL;
  
-            bool use_rope = arch == LLM_ARCH_LLAMA4
-                ? (il + 1) % hparams.n_no_rope_layer_step != 0
-                : true;
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
  
              // norm
              cur = build_norm(inpL,
@@ -4542,7 +4731,7 @@ struct llm_build_llama : public llm_graph_context {
              // self-attention
              {
                  // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  // compute Q and K and RoPE them
                  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4590,7 +4779,7 @@ struct llm_build_llama : public llm_graph_context {
                  cb(Kcur, "Kcur", il);
                  cb(Vcur, "Vcur", il);
  
-                if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+                if (use_rope && hparams.use_kq_norm) {
                      // Llama4TextL2Norm
                      Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
                      Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4614,23 +4803,7 @@ struct llm_build_llama : public llm_graph_context {
              ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
              cb(ffn_inp, "ffn_inp", il);
  
-            // feed-forward network (non-MoE)
-            if (model.layers[il].ffn_gate_inp == nullptr) {
-
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = build_ffn(cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-                cb(cur, "ffn_out", il);
-
-            } else if (arch == LLM_ARCH_LLAMA4) {
+            {
                  // llama4 MoE
                  ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
                          model.layers[il].ffn_norm, NULL,
@@ -4660,26 +4833,6 @@ struct llm_build_llama : public llm_graph_context {
  
                  cur = ggml_add(ctx0, moe_out, shexp_out);
                  cb(cur, "ffn_moe_out_merged", il);
-
-            } else {
-                // MoE branch
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = build_moe_ffn(cur,
-                        model.layers[il].ffn_gate_inp,
-                        model.layers[il].ffn_up_exps,
-                        model.layers[il].ffn_gate_exps,
-                        model.layers[il].ffn_down_exps,
-                        nullptr,
-                        n_expert, n_expert_used,
-                        LLM_FFN_SILU, true,
-                        false, 0.0,
-                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                        il);
-                cb(cur, "ffn_moe_out", il);
              }
  
              cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4753,7 +4906,7 @@ struct llm_build_deci : public llm_graph_context {
              } else if (n_head > 0) {
                  // self-attention
                  // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  // compute Q and K and RoPE them
                  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7202,8 +7355,8 @@ struct llm_build_phi2 : public llm_graph_context {
      }
  };
  
-struct llm_build_phi3 : public llm_graph_context {
-    llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_phi3_iswa : public llm_graph_context {
+    llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
          const int64_t n_embd_head = hparams.n_embd_head_v;
          const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  
@@ -7217,7 +7370,7 @@ struct llm_build_phi3 : public llm_graph_context {
          // inp_pos - contains the positions
          ggml_tensor * inp_pos = build_inp_pos();
  
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
  
          for (int il = 0; il < n_layer; ++il) {
              auto * residual = inpL;
@@ -7225,7 +7378,7 @@ struct llm_build_phi3 : public llm_graph_context {
              // self-attention
              {
                  // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  ggml_tensor* attn_norm_output = build_norm(inpL,
                          model.layers[il].attn_norm,
@@ -7977,7 +8130,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
          for (int il = 0; il < n_layer; ++il) {
              ggml_tensor * inpSA = inpL;
  
-            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
              // norm
              cur = build_norm(inpL,
@@ -8277,8 +8430,8 @@ struct llm_build_gemma : public llm_graph_context {
      }
  };
  
-struct llm_build_gemma2 : public llm_graph_context {
-    llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_gemma2_iswa : public llm_graph_context {
+    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
          const int64_t n_embd_head = hparams.n_embd_head_k;
  
          ggml_tensor * cur;
@@ -8292,7 +8445,7 @@ struct llm_build_gemma2 : public llm_graph_context {
          // inp_pos - contains the positions
          ggml_tensor * inp_pos = build_inp_pos();
  
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
  
          for (int il = 0; il < n_layer; ++il) {
              // norm
@@ -8414,8 +8567,8 @@ struct llm_build_gemma2 : public llm_graph_context {
      }
  };
  
-struct llm_build_gemma3 : public llm_graph_context {
-    llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_gemma3_iswa : public llm_graph_context {
+    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
          const int64_t n_embd_head = hparams.n_embd_head_k;
  
          ggml_tensor * cur;
@@ -8433,13 +8586,11 @@ struct llm_build_gemma3 : public llm_graph_context {
          ggml_tensor * inp_pos = build_inp_pos();
  
          // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
  
          for (int il = 0; il < n_layer; ++il) {
-            const bool is_swa = hparams.is_swa(il);
-
-            const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
-            const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  
              // norm
              cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -9016,8 +9167,8 @@ struct llm_build_command_r : public llm_graph_context {
      }
  };
  
-struct llm_build_cohere2 : public llm_graph_context {
-    llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_cohere2_iswa : public llm_graph_context {
+    llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
          const int64_t n_embd_head = hparams.n_embd_head_v;
  
          GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9032,7 +9183,7 @@ struct llm_build_cohere2 : public llm_graph_context {
          // inp_pos - contains the positions
          ggml_tensor * inp_pos = build_inp_pos();
  
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
  
          for (int il = 0; il < n_layer; ++il) {
              const bool is_swa = hparams.is_swa(il);
@@ -9045,7 +9196,7 @@ struct llm_build_cohere2 : public llm_graph_context {
              // self-attention
              {
                  // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  // compute Q and K and RoPE them
                  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9983,7 +10134,7 @@ struct llm_build_deepseek : public llm_graph_context {
              // self-attention
              {
                  // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  // compute Q and K and RoPE them
                  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11347,7 +11498,7 @@ struct llm_build_exaone : public llm_graph_context {
              // self-attention
              {
                  // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  // compute Q and K and RoPE them
                  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12263,7 +12414,7 @@ struct llm_build_granite : public llm_graph_context {
                  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  
                  if (use_rope) {
-                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
                      Qcur = ggml_rope_ext(
                              ctx0, Qcur, inp_pos, rope_factors,
                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -12916,7 +13067,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
              // self-attention
              {
                  // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  
                  // compute Q and K and RoPE them
                  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13068,14 +13219,31 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  
                  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
  
-                res = new llama_kv_cache_unified(
-                        *this,
-                        params.type_k,
-                        params.type_v,
-                        !cparams.flash_attn,
-                        cparams.offload_kqv,
-                        cparams.n_ctx,
-                        padding);
+                if (hparams.n_swa > 0) {
+                    res = new llama_kv_cache_unified_iswa(
+                            *this,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            params.swa_full,
+                            cparams.n_seq_max,
+                            cparams.n_batch,
+                            padding);
+                } else {
+                    res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                }
              }
      }
  
@@ -13090,11 +13258,14 @@ llm_graph_result_ptr llama_model::build_graph(
  
      switch (arch) {
          case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
          case LLM_ARCH_MINICPM:
              {
                  llm = std::make_unique<llm_build_llama>(*this, params, gf);
              } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+            } break;
          case LLM_ARCH_DECI:
              {
                  llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13169,7 +13340,7 @@ llm_graph_result_ptr llama_model::build_graph(
          case LLM_ARCH_PHI3:
          case LLM_ARCH_PHIMOE:
              {
-                llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+                llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
              } break;
          case LLM_ARCH_PLAMO:
              {
@@ -13201,11 +13372,11 @@ llm_graph_result_ptr llama_model::build_graph(
              } break;
          case LLM_ARCH_GEMMA2:
              {
-                llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
              } break;
          case LLM_ARCH_GEMMA3:
              {
-                llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
              } break;
          case LLM_ARCH_STARCODER2:
              {
@@ -13225,7 +13396,7 @@ llm_graph_result_ptr llama_model::build_graph(
              } break;
          case LLM_ARCH_COHERE2:
              {
-                llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
              } break;
          case LLM_ARCH_DBRX:
              {
diff --git a/src/llama-model.h b/src/llama-model.h

index 6bdec263b709b2b027db73799aaa71b5f7326225..cbea2cb331b626f6ca2f829a186ec0822b20ce76 100644 (file)
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -398,7 +398,10 @@ struct llama_model {
  
      const struct ggml_tensor * get_tensor(const char * name) const;
  
-    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
  
      // note: can mutate `cparams`
      // TODO: move this to new llm_arch_model_i interface
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp

index d77c40522f67e3c86d2728d71bf14a46b6c0f728..06196cf24fc89036325efec797becfe96f0e994c 100644 (file)
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -991,6 +991,7 @@ struct cmd_params_instance {
          cparams.flash_attn   = flash_attn;
          cparams.embeddings   = embeddings;
          cparams.op_offload   = !no_op_offload;
+        cparams.swa_full     = false;
  
          return cparams;
      }
diff --git a/tools/server/server.cpp b/tools/server/server.cpp

index 348588a2cb22403f90ed772f2f04debdfc5c4f0a..f8b7ff062a7e0d3f8762409ab2dc6c2df59c2fd1 100644 (file)
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2004,6 +2004,23 @@ struct server_context {
              }
          }
  
+        if (!llama_kv_self_can_shift(ctx)) {
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
+        }
+
          return true;
      }
  
@@ -3181,7 +3198,15 @@ struct server_context {
                                  // if we don't cache the prompt, we have to remove the entire KV cache
                                  llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                                  slot.n_past = 0;
-                                slot.cache_tokens.clear();
+                                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+                            }
+
+                            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                                    SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+                                            "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                                    slot.n_past = 0;
+                                }
                              }
                          }
author	Georgi Gerganov <redacted>
	Tue, 20 May 2025 05:05:46 +0000 (08:05 +0300)
committer	GitHub <redacted>
	Tue, 20 May 2025 05:05:46 +0000 (08:05 +0300)
common/arg.cpp		patch \| blob \| history
common/common.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
include/llama.h		patch \| blob \| history
src/llama-context.cpp		patch \| blob \| history
src/llama-graph.cpp		patch \| blob \| history
src/llama-graph.h		patch \| blob \| history
src/llama-hparams.h		patch \| blob \| history
src/llama-kv-cache.cpp		patch \| blob \| history
src/llama-kv-cache.h		patch \| blob \| history
src/llama-memory.h		patch \| blob \| history
src/llama-model.cpp		patch \| blob \| history
src/llama-model.h		patch \| blob \| history
tools/llama-bench/llama-bench.cpp		patch \| blob \| history
tools/server/server.cpp		patch \| blob \| history