float stop,
float step);
-#define GGML_KQ_MASK_PAD 1
-
- // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
- // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
- // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
- // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
- // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+ // q:    [n_embd_k, n_batch, n_head,    ne3 ]
+ // k:    [n_embd_k, n_kv,    n_head_kv, ne3 ]
+ // v:    [n_embd_v, n_kv,    n_head_kv, ne3 ] !! not transposed !!
+ // mask: [n_kv,     n_batch, ne32,      ne33]
+ // res:  [n_embd_v, n_head,  n_batch,   ne3 ] !! permuted !!
//
// broadcast:
// n_head % n_head_kv == 0
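// Illustrative sketch (not part of this diff): with GGML_KQ_MASK_PAD gone, a caller
// sizes the KQ mask to exactly [n_kv, n_batch] instead of rounding the batch
// dimension up; the names ctx, q, k, v, n_kv, n_batch and kq_scale are assumptions
// used only for illustration.
struct ggml_tensor * kq_mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, n_kv, n_batch, 1, 1); // F16 mask as expected by flash attention
ggml_set_input(kq_mask);
struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); // scale, max_bias, logit_softcap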
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
- res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+ res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
return res;
}
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
- res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+ res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
- res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+ res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
return res;
}
}
}
- for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-     for (int j = 0; j < n_enc; ++j) {
-         data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
-     }
- }
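// Illustrative sketch (not part of this diff): since the mask now has exactly
// n_tokens rows, there is no tail of padding rows left to blank out with -INFINITY;
// only the real rows are written. data, h, n_enc and n_tokens follow the removed
// code above, while allowed(i, j) is a hypothetical predicate standing in for the
// per-position visibility check done by the real code.
for (int i = 0; i < n_tokens; ++i) {
    for (int j = 0; j < n_enc; ++j) {
        data[h*(n_enc*n_tokens) + i*n_enc + j] = allowed(i, j) ? 0.0f : -INFINITY;
    }
}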
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
- inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
- inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
- inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
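// Worked example (illustrative, not part of this diff): with n_stream = 2 and a
// micro-batch of n_tokens = 8, the unified mask above is allocated as
// [n_kv, 8/2, 1, 2] = [n_kv, 4, 1, 2] -- one 4-row slice per stream, with no
// GGML_PAD round-up of the second dimension anymore.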
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
- inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
ggml_set_input(inp->cross_kq_mask);
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
- inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
- inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
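// Illustrative sketch (not part of this diff): the *_cnv tensors exist because the
// flash-attention kernel consumes an F16 mask, while the fallback path feeds the
// F32 mask into the fused softmax; q, k, v, kq_scale and hparams are assumed names
// used only for illustration.
ggml_tensor * cur = nullptr;
if (cparams.flash_attn) {
    cur = ggml_flash_attn_ext(ctx0, q, k, v, inp->self_kq_mask_cnv, kq_scale, hparams.f_max_alibi_bias, 0.0f);
} else {
    ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);                                                 // [n_kv, n_tokens, n_head]
    kq = ggml_soft_max_ext(ctx0, kq, inp->self_kq_mask_cnv, kq_scale, hparams.f_max_alibi_bias); // mask applied inside the fused softmax
    cur = ggml_mul_mat(ctx0, v, kq);                                                             // v is pre-transposed on this path in the real code
}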