vulkan: fix MMQ quantize_y condition (llama/17301)

author Ruben Ortlam <redacted>
Sun, 16 Nov 2025 18:38:17 +0000 (19:38 +0100)
committer Georgi Gerganov <redacted>
Mon, 17 Nov 2025 19:05:46 +0000 (21:05 +0200)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ef99c3c1eba45b0c5801ecab07f95621a9e2c890..5bdc675cf6e381ef447e1f81978b746ee8ee6786 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -6444,7 +6444,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
  
      const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
  
-    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0;
+    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0;
  
      // Check for mmq first
      vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr;
@@ -6731,7 +6731,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
      const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
  
      const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
-    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne11, ne10, src0->type);
+    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne11, ne10, src0->type);
  
      vk_pipeline to_fp16_vk_0 = nullptr;
      vk_pipeline to_fp16_vk_1 = nullptr;
@@ -7220,7 +7220,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
  
      const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
  
-    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0;
+    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0;
  
      // Check for mmq first
      vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr;
author	Ruben Ortlam <redacted>
	Sun, 16 Nov 2025 18:38:17 +0000 (19:38 +0100)
committer	Georgi Gerganov <redacted>
	Mon, 17 Nov 2025 19:05:46 +0000 (21:05 +0200)