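vulkan: Skip syncing for prealloc_y when it is reused (#15544)

Move the prealloc_y_need_sync barrier out of the unconditional pre-check and into the branches that actually rewrite prealloc_y (the copy-to-contiguous and q8_1 quantize paths), so no sync is issued when the last pipeline and tensor used match and the existing contents are reused.
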
author Jeff Bolz <redacted>
Sat, 30 Aug 2025 09:11:22 +0000 (04:11 -0500)

committer GitHub <redacted>
Sat, 30 Aug 2025 09:11:22 +0000 (11:11 +0200)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp

index 04ad664e61c0731317919d4218477cf15f16411c..40962de50836fd1ae3bfd3411959929f8e813392 100644 (file)
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5800,11 +5800,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
              ggml_vk_sync_buffers(ctx, subctx);
          }
      }
-    if (y_non_contig || quantize_y) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
  
      if (x_non_contig) {
          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -5816,6 +5811,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
      if (y_non_contig) {
          if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
              ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
              ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
              ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
              ctx->prealloc_y_last_tensor_used = src1;
@@ -5824,6 +5822,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
      if (quantize_y) {
          if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
              ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
              ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
              ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
              ctx->prealloc_y_last_tensor_used = src1;
@@ -6008,11 +6009,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
              ggml_vk_sync_buffers(ctx, subctx);
          }
      }
-    if (y_non_contig) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
  
      if (x_non_contig) {
          GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
@@ -6022,6 +6018,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
          GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
          if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
              ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
              ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
              ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
              ctx->prealloc_y_last_tensor_used = src1;
@@ -6454,11 +6453,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
              ggml_vk_sync_buffers(ctx, subctx);
          }
      }
-    if (y_non_contig) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
  
      if (x_non_contig) {
          ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -6471,6 +6465,9 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
      if (y_non_contig) {
          if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
              ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
              ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
              ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
              ctx->prealloc_y_last_tensor_used = src1;
@@ -6668,11 +6665,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
              ggml_vk_sync_buffers(ctx, subctx);
          }
      }
-    if (y_non_contig) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
  
      if (x_non_contig) {
          GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
@@ -6682,6 +6674,9 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
          GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
          if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
              ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
              ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
              ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
              ctx->prealloc_y_last_tensor_used = src1;
author	Jeff Bolz <redacted>
	Sat, 30 Aug 2025 09:11:22 +0000 (04:11 -0500)
committer	GitHub <redacted>
	Sat, 30 Aug 2025 09:11:22 +0000 (11:11 +0200)