) {
constexpr int ncols = ncols1 * ncols2;
- const bool is_mla = DV == 512; // TODO better parameterization
-
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
+ // TODO: make this more generic by removing the notion of "MLA".
+ // for example "is V a view of K?" so we can skip loading it.
+ // V strides should be driven by V itself and avoid assumption of the data layout
+ const bool is_mla = V->op == GGML_OP_VIEW && V->src[0] == K;
+
GGML_ASSERT(V || is_mla);
const ggml_tensor * mask = dst->src[3];
// For MLA K and V have the same data.
// Therefore, iterate over V in reverse and re-use the data if possible.
static_assert(!mla || nstages <= 1, "combination of MLA and multi-stage loading not implemented");
- constexpr int reusable_cutoff = mla ? (DKQ - 1) - (DKQ - 1) % (2*nbatch_K2) - (DKQ - DV) : DV;
+ // constexpr int reusable_cutoff = mla ? (DV - 1) - (DV - 1) % (2*nbatch_K2) : DV;
+ constexpr int reusable_cutoff = DV; // TODO implement properly
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
T_A_VKQ A_identity;
make_identity_mat(A_identity);
(const half *) (mask + nb33*(sequence % ne33));
float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);
- const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
+ const half2 * V_h2 = mla ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
(const half *) (mask + nb33*(sequence % ne33));
float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);
- const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
+ const half2 * V_h2 = mla ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1], true); // the K tensor is usually a view of the K cache
ggml_set_name(k, "k");
- ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache
+ ggml_tensor * v = nullptr;
+ if (hsk_padded == 576 && hsv_padded == 512) {
+ // TODO: this branch should become a separate test case parameter instead of hardcoding this for these head shapes
+
+ // in this branch, the V cache is sub-view of the K cache. this is used by some MLA-based models
+ // for more info:
+ // - https://github.com/ggml-org/llama.cpp/pull/13435
+ // - https://github.com/ggml-org/llama.cpp/pull/18953#issuecomment-3774948392
+ // - https://github.com/ggml-org/llama.cpp/pull/18986
+ v = ggml_view_4d(ctx, k, hsv_padded, kv, nh, nr23[1], k->nb[1], k->nb[2], k->nb[3], 0);
+ } else {
+ v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache
+ }
ggml_set_name(v, "v");
ggml_tensor * m = nullptr;