kv-cache : support V-less cache (llama/19067)

author Georgi Gerganov <redacted>

Sun, 25 Jan 2026 13:48:56 +0000 (15:48 +0200)

committer Georgi Gerganov <redacted>

Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
author Georgi Gerganov <redacted>
Sun, 25 Jan 2026 13:48:56 +0000 (15:48 +0200)
committer Georgi Gerganov <redacted>
Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
diff --git a/src/ggml-cuda/fattn-common.cuh b/src/ggml-cuda/fattn-common.cuh

index 40c7725784cab9b5ac1fcd00b29fe8b9f05dfa6d..13c5b0a4594505bc0187c1f2068b94bbadcc6d3a 100644 (file)
--- a/src/ggml-cuda/fattn-common.cuh
+++ b/src/ggml-cuda/fattn-common.cuh
@@ -782,7 +782,7 @@ void launch_fattn(
      const ggml_tensor * K = dst->src[1];
      const ggml_tensor * V = dst->src[2];
  
-    const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data;
+    const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
  
      const ggml_tensor * mask  = dst->src[3];
      const ggml_tensor * sinks = dst->src[4];
diff --git a/src/ggml-cuda/fattn.cu b/src/ggml-cuda/fattn.cu

index ba2b96bc3278aa8454db3af2380b2a556266fdd9..a5e662418174a101f56100748ee7013a2b77203e 100644 (file)
--- a/src/ggml-cuda/fattn.cu
+++ b/src/ggml-cuda/fattn.cu
@@ -247,7 +247,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
          }
      }
  
-    const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data;
+    const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
  
      const int cc = ggml_cuda_info().devices[device].cc;
author	Georgi Gerganov <redacted>
	Sun, 25 Jan 2026 13:48:56 +0000 (15:48 +0200)
committer	Georgi Gerganov <redacted>
	Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
src/ggml-cuda/fattn-common.cuh		patch | blob | history
src/ggml-cuda/fattn.cu		patch | blob | history