// wait for the computation to finish (automatically done when obtaining the model output)
//llama_synchronize(&lctx);
// decide if we need to defrag the kv cache
- if (cparams.defrag_thold >= 0.0f) {
- const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
+ if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
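+ // e.g. (illustrative numbers): with kv_self.n == 512 cells in the active
+ // window and kv_self.used == 448, fragmentation = 1.0 - 448/512 = 0.125,
+ // so a defrag is queued for any defrag_thold below 0.125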
// queue defragmentation for next llama_kv_cache_update
if (fragmentation > cparams.defrag_thold) {
// number of moves performed (each move relocates one contiguous block of cells)
uint32_t n_moves = 0;
+ // each move requires 6*n_layer tensors (see build_defrag)
+ // - source view, destination view, copy operation
+ // - x2 for keys and values
+ const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
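+ // e.g. assuming LLAMA_MAX_NODES == 8192 and n_layer == 32, this allows
+ // 8192/(6*32) = 42 block moves per defragmentation graph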
+
// determine which KV cells to move where
//
// cell i moves to ids[i]
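//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved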
// determine the size of the hole (nh consecutive empty cells)
while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
    nh++;
}
- // each move requires 6*n_layer tensors (see build_defrag)
- // - source view, destination view, copy operation
- // - x2 for keys and values
- //
- if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
- // the graph is too big, we cannot move more cells
- break;
- }
-
uint32_t nf = 0;
uint32_t is = n_kv - 1;
// are we moving a contiguous block of memory?
bool cont = false;
+ // should we stop searching for the next move?
+ bool stop = false;
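+ // note: consecutive cells merge into the current move (tracked by `cont`),
+ // so the max_moves budget counts contiguous blocks rather than single cells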
+
// go back and move the nh cells found above into the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
+ if (n_moves == max_moves) {
+ stop = true;
+ break;
+ }
+
cont = false;
continue;
}
}
}
+ if (stop || n_moves == max_moves) {
+ break;
+ }
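+ // (`stop` is set when the budget runs out mid-scan; re-checking n_moves
+ // here also catches a budget exhausted exactly as the hole is filled)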
+
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
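
// usage sketch (client-side, not part of this diff): automatic defrag is
// driven by the public context param, e.g.
//
//   llama_context_params cparams = llama_context_default_params();
//   cparams.defrag_thold = 0.1f; // defrag once >10% of the KV window is holes
//
// as the check above shows, a negative defrag_thold disables this path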