vulkan: Support ne[3]>1 in noncontig matrix-vector multiply (llama/15015)

author Jeff Bolz <redacted>

Sat, 2 Aug 2025 08:48:30 +0000 (03:48 -0500)

committer Georgi Gerganov <redacted>

Sat, 2 Aug 2025 14:51:21 +0000 (17:51 +0300)
author Jeff Bolz <redacted>
Sat, 2 Aug 2025 08:48:30 +0000 (03:48 -0500)
committer Georgi Gerganov <redacted>
Sat, 2 Aug 2025 14:51:21 +0000 (17:51 +0300)
diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp

index 2cd32fbb578ad8e8749ccdd7339e036e902905db..648cdd79b7dd302d10e24e4ecbfacc21a5a457bf 100644 (file)
--- a/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2885,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
              ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len,              mul_mat_vec_p021_f16_f32_data,              "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
          }
      }
-    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
  
      ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
      ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -5821,7 +5821,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
      const uint64_t ne00 = src0->ne[0];
      const uint64_t ne01 = src0->ne[1];
      const uint64_t ne02 = src0->ne[2];
-    // const uint64_t ne03 = src0->ne[3];
+    const uint64_t ne03 = src0->ne[3];
  
      const uint64_t nb01 = src0->nb[1];
      const uint64_t nb02 = src0->nb[2];
@@ -5833,7 +5833,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
      const uint64_t ne12 = src1->ne[2];
      // const uint64_t ne13 = src1->ne[3];
  
+    const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
+    const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
+    const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));
+
      GGML_ASSERT(ne11 == 1);
+    GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op
  
      ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
      ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
@@ -5849,7 +5854,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
          src1_uma = d_Qy != nullptr;
      }
  
-    const uint64_t d_ne = ne01 * ne11 * ne12;
+    const uint64_t d_ne = ne01 * ne11 * ne12 * ne03;
  
      const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
      const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
@@ -5884,10 +5889,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
      const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
  
      // compute
-    const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
+    const std::array<uint32_t, 12> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
      ggml_vk_sync_buffers(subctx);
      ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
-        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
  }
  
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
diff --git a/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp b/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp

index bc633369f9bb58dc4445cfddb8503706cb26babb..638878d94ce080b7145953d0f1610107d5f22a3f 100644 (file)
--- a/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
+++ b/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
@@ -26,6 +26,9 @@ layout (push_constant) uniform parameter
      uint ne12;
      uint b_offset;
      uint d_offset;
+    uint nb03;
+    uint nb13;
+    uint nb23;
  } p;
  
  shared FLOAT_TYPE tmp[BLOCK_SIZE];
@@ -34,6 +37,7 @@ void main() {
      const uint tid       = gl_LocalInvocationID.x;
      const uint row_x     = gl_GlobalInvocationID.y;
      const uint channel   = gl_GlobalInvocationID.z;
+    const uint i3        = gl_WorkGroupID.x;
      const uint channel_x = channel / p.channel_x_divisor;
      const uint channel_y = channel % p.ne12;
  
@@ -41,7 +45,7 @@ void main() {
      const uint nrows_dst = p.nrows_x;
      const uint row_dst   = row_x;
  
-    const uint idst = channel*nrows_dst + row_dst;
+    const uint idst = i3*p.nb23 + channel*nrows_dst + row_dst;
  
      FLOAT_TYPE temp = 0.0f;
  
@@ -58,8 +62,8 @@ void main() {
  
                  const uint row_y = col_x;
  
-                const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-                const uint iy = channel_y*p.channel_stride_y + row_y;
+                const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+                const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
  
                  const vec4 av4 = vec4(data_a_v4[ix / 4]);
                  const vec4 bv4 = vec4(data_b_v4[iy / 4]);
@@ -74,8 +78,8 @@ void main() {
  
              const uint row_y = col_x;
  
-            const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-            const uint iy = channel_y*p.channel_stride_y + row_y;
+            const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+            const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
  
              const vec4 av4 = vec4(data_a_v4[ix / 4]);
              const vec4 bv4 = vec4(data_b_v4[iy / 4]);
@@ -91,8 +95,8 @@ void main() {
  
              const uint row_y = col_x;
  
-            const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-            const uint iy = channel_y*p.channel_stride_y + row_y;
+            const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+            const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
  
              const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
  
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 479b3fad486848ede2c22cfe4fc66bdb74c78981..ea65f1a2ee4dd2d2c625c90b8b045b2ee0cf51d7 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5592,13 +5592,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
      test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1,  1}, {4, 1}, {0, 2, 1, 3}));
      test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67,  {1,  1}, {4, 1}, {0, 2, 1, 3}));
  
-    for (auto bs : {1,2,4,8}) {
-        for (auto nr : {1,4}) {
-            for (uint32_t m = 0; m < 2; ++m) {
-                for (uint32_t k = 0; k < 2; ++k) {
-                    for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
-                        test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k,  {bs,  1}, {nr, 1}, {0, 2, 1, 3}));
-                        test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m,  1, 1056 + k, {bs,  1}, {nr, 1}, {0, 1, 2, 3}, true));
+    for (auto bs2 : {1,3}) {
+        for (auto bs : {1,2,4,8}) {
+            for (auto nr : {1,4}) {
+                for (uint32_t m = 0; m < 2; ++m) {
+                    for (uint32_t k = 0; k < 2; ++k) {
+                        for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
+                            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k,  {bs,  bs2}, {nr, 1}, {0, 2, 1, 3}));
+                            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m,  1, 1056 + k, {bs,  bs2}, {nr, 1}, {0, 1, 2, 3}, true));
+                        }
                      }
                  }
              }
author	Jeff Bolz <redacted>
	Sat, 2 Aug 2025 08:48:30 +0000 (03:48 -0500)
committer	Georgi Gerganov <redacted>
	Sat, 2 Aug 2025 14:51:21 +0000 (17:51 +0300)
src/ggml-vulkan/ggml-vulkan.cpp		patch \| blob \| history
src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history