vulkan: remove the need for the dryrun (llama/16826)

author Jeff Bolz <redacted>
Tue, 4 Nov 2025 19:28:17 +0000 (13:28 -0600)

committer Georgi Gerganov <redacted>
Sun, 9 Nov 2025 16:30:22 +0000 (18:30 +0200)
diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp

index 8d1a85c96939b0b06a7a80bdde13edccb861381b..7fc46bc46bc395fd3ac6739c824462c087111a1d 100644 (file)
--- a/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/src/ggml-vulkan/ggml-vulkan.cpp
@@ -129,7 +129,7 @@ struct vk_pipeline_struct {
      uint32_t align;
      // true if fields have been set by ggml_vk_create_pipeline
      bool initialized {};
-    // set to true to request the pipeline is compiled after the dryrun
+    // set to true to request that the pipeline be compiled
      bool needed {};
      // set to true when the shader has been compiled
      bool compiled {};
@@ -539,9 +539,6 @@ struct vk_device_struct {
      bool mul_mat_id_m[GGML_TYPE_COUNT];
      bool mul_mat_id_s[GGML_TYPE_COUNT];
  
-    // set to true to indicate that some shaders need to be compiled after the dryrun
-    bool need_compiles {};
-
      vk::DescriptorSetLayout dsl;
  
      vk_matmul_pipeline pipeline_matmul_f32 {};
@@ -1408,6 +1405,10 @@ struct ggml_vk_garbage_collector {
      std::vector<vk_context> contexts;
  };
  
+static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
+static void ggml_vk_load_shaders(vk_device& device);
+static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
+
  #if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
  #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
  
@@ -1561,8 +1562,11 @@ struct ggml_backend_vk_context {
      bool almost_ready_fence_pending {};
      // Set before op_add and unset after op_rms_norm to indicate that the add should
      // write partial sums to accumulate the square of the vector components
+    bool do_add_rms_partials_offset_calculation;
      bool do_add_rms_partials;
  
+    uint64_t last_total_mul_mat_bytes {};
+
      // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
      vk_pipeline_struct * prealloc_y_last_pipeline_used {};
      const ggml_tensor * prealloc_y_last_tensor_used {};
@@ -1865,8 +1869,9 @@ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx,
      ctx->pipeline_descriptor_set_requirements += n;
      if (!pipeline->compiled) {
          pipeline->needed = true;
-        ctx->device->need_compiles = true;
+        ggml_vk_load_shaders(ctx->device);
      }
+    ggml_pipeline_allocate_descriptor_sets(ctx);
  }
  
  static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
@@ -1878,7 +1883,9 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
  
      vk_device& device = ctx->device;
  
-    uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
+    // Grow by 50% to avoid frequent allocations
+    uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements});
+    uint32_t to_alloc = needed - ctx->descriptor_sets.size();
      uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
      uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
  
@@ -3916,7 +3923,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
      for (auto &c : compiles) {
          c.wait();
      }
-    device->need_compiles = false;
  }
  
  static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
@@ -5020,6 +5026,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
      ctx->prealloc_size_x = 0;
      ctx->prealloc_size_y = 0;
      ctx->prealloc_size_split_k = 0;
+    ctx->prealloc_size_add_rms_partials = 0;
  
      ctx->fence = ctx->device->device.createFence({});
      ctx->almost_ready_fence = ctx->device->device.createFence({});
@@ -6204,11 +6211,11 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
      ggml_vk_sync_buffers(ctx, subctx);
  }
  
-static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k, bool dryrun = false) {
+static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) {
      VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
-    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    std::cerr << "))");
      GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16);  // NOLINT
      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
  
@@ -6322,7 +6329,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
          to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
      }
  
-    if (dryrun) {
+    {
          const uint64_t x_sz_upd = x_sz * ne02 * ne03;
          uint64_t y_sz_upd = y_sz * ne12 * ne13;
          if (quantize_y) {
@@ -6337,12 +6344,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
          }
          if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
              ctx->prealloc_size_x = x_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
          if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
              ctx->prealloc_size_y = y_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
          if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
              ctx->prealloc_size_split_k = split_k_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
  
          // Request descriptor sets
@@ -6359,7 +6369,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
          if (split_k > 1) {
              ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
          }
-        return;
      }
  
      vk_buffer d_D = dst_buf_ctx->dev_buffer;
@@ -6515,7 +6524,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
      GGML_UNUSED(k);
  }
  
-static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
@@ -6523,7 +6532,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
      VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
-    std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)");
+    std::cerr << ")),)");
      GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16);  // NOLINT
      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
  
@@ -6619,7 +6628,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
      const uint64_t y_sz = quantize_y ? (y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
      const uint64_t d_sz = sizeof(float) * d_ne;
  
-    if (dryrun) {
+    {
          const uint64_t x_sz_upd = x_sz * ne02 * ne03;
          uint64_t y_sz_upd = y_sz * ne12 * ne13;
          if (quantize_y) {
@@ -6632,9 +6641,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
          }
          if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
              ctx->prealloc_size_x = x_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
          if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
              ctx->prealloc_size_y = y_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
  
          // Request descriptor sets
@@ -6648,7 +6659,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
              ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
          }
          ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
-        return;
      }
  
      vk_buffer d_D;
@@ -6806,14 +6816,14 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
      }
  }
  
-static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
      VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
-    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    std::cerr << "))");
      GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
      GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]);  // NOLINT
      GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]);  // NOLINT
@@ -6859,10 +6869,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
          gqa_ratio = 1;
      }
  
-    if (dryrun) {
+    {
          // Request descriptor sets
          ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
-        return;
      }
  
      vk_buffer d_D;
@@ -6936,14 +6945,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
          }, pc, { 1, (uint32_t)ne01, workgroups_z });
  }
  
-static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
      VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
-    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    std::cerr << "))");
      GGML_ASSERT(!ggml_is_transposed(src0));
      GGML_ASSERT(!ggml_is_transposed(src1));
      GGML_ASSERT(!ggml_is_permuted(src0));
@@ -6995,10 +7004,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
      const uint64_t qy_sz = ggml_nbytes(src1);
      const uint64_t d_sz = sizeof(float) * d_ne;
  
-    if (dryrun) {
+    {
          // Request descriptor sets
          ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
-        return;
      }
  
      vk_buffer d_D;
@@ -7066,7 +7074,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
          }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
  }
  
-static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      ggml_tensor * src0 = dst->src[0];
      ggml_tensor * src1 = dst->src[1];
@@ -7094,7 +7102,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
              dst2.ne[0] = cur_M_size;
              src02.ne[1] = cur_M_size;
  
-            ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true, dryrun);
+            ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true);
  
              m_offset += cur_M_size;
          }
@@ -7108,21 +7116,21 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
          src1->nb[1] <= src1->nb[3] &&
          src0->ne[3] == 1 &&
          src1->ne[3] == 1) {
-        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx, dryrun);
+        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx);
      } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
                 !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
-        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx, dryrun);
+        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx);
      // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
      // when ne12 and ne13 are one.
      } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) &&
                 (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) {
-        ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx, dryrun);
+        ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx);
      } else {
-        ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false, dryrun);
+        ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false);
      }
  }
  
-static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
      VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
      std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -7251,7 +7259,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
          to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
      }
  
-    if (dryrun) {
+    {
          const uint64_t x_sz_upd = x_sz * ne02 * ne03;
          uint64_t y_sz_upd = y_sz * ne12 * ne13;
          if (quantize_y) {
@@ -7264,9 +7272,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
          }
          if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
              ctx->prealloc_size_x = x_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
          if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
              ctx->prealloc_size_y = y_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
  
          // Request descriptor sets
@@ -7280,7 +7290,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
          if (quantize_y) {
              ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
          }
-        return;
      }
  
      vk_buffer d_D = dst_buf_ctx->dev_buffer;
@@ -7396,7 +7405,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
      }
  }
  
-static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      ggml_tensor * src0 = dst->src[0];
      ggml_tensor * src1 = dst->src[1];
@@ -7405,7 +7414,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
      std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
      std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
-    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    std::cerr << "))");
      GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16);  // NOLINT
      GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
      GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -7493,7 +7502,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
      GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
      GGML_ASSERT(dmmv != nullptr);
  
-    if (dryrun) {
+    {
          const uint64_t x_sz_upd = x_sz * ne02 * ne03;
          const uint64_t y_sz_upd = y_sz * ne12 * ne13;
          if (
@@ -7503,9 +7512,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
          }
          if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
              ctx->prealloc_size_x = x_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
          if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
              ctx->prealloc_size_y = y_sz_upd;
+            ggml_vk_preallocate_buffers(ctx, subctx);
          }
  
          // Request descriptor sets
@@ -7516,7 +7527,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
              ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
          }
          ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
-        return;
      }
  
      vk_buffer d_D;
@@ -7664,16 +7674,16 @@ static bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int no
      return src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type));
  }
  
-static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      ggml_tensor * src0 = dst->src[0];
      ggml_tensor * src1 = dst->src[1];
      ggml_tensor * src2 = dst->src[2];
      VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
      if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) {
-        ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx, dryrun);
+        ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx);
      } else {
-        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst);
      }
  }
  
@@ -7733,7 +7743,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
      return supported;
  }
  
-static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) {
      VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
      std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
      std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3];
@@ -7741,7 +7751,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
      if (sinks) {
          std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3];
      }
-    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    std::cerr << "))");
  
      GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
      GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
@@ -7915,15 +7925,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
      }
      if (ctx->prealloc_size_split_k < split_k_size) {
          ctx->prealloc_size_split_k = split_k_size;
+        ggml_vk_preallocate_buffers(ctx, subctx);
      }
  
-    if (dryrun) {
+    {
          // Request descriptor sets
          ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
          if (split_k > 1) {
              ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
          }
-        return;
      }
  
      float scale         = 1.0f;
@@ -8727,7 +8737,7 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
  }
  
  template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) {
      VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
      if (src1 != nullptr) {
          std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -8739,7 +8749,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
          std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3];
      }
      std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
-    std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
+    std::cerr << "), " << ggml_op_name(op) << ")");
      GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))));  // NOLINT
      GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0));  // NOLINT
      GGML_ASSERT(dst->buffer != nullptr);
@@ -8790,10 +8800,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
          GGML_ABORT("fatal error");
      }
  
-    if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        return;
-    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
  
      const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
  
@@ -9174,7 +9181,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
      }
  }
  
-static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9186,10 +9193,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, 0,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9206,10 +9213,10 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, offset,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
      const ggml_tensor *first_node = cgraph->nodes[node_idx];
      const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
  
@@ -9254,10 +9261,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
          GGML_ABORT("fatal error");
      }
  
-    if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        return;
-    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
  
      ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT];
      vk_buffer buf[MAX_PARAMETER_COUNT];
@@ -9319,7 +9323,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
          }, pc, elements);
  }
  
-static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9331,10 +9335,10 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, ctx->do_add_rms_partials,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9346,10 +9350,10 @@ static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, 0,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9361,10 +9365,10 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, 0,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9376,10 +9380,10 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, 0,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t src2_type_size = ggml_type_size(src2->type);
@@ -9391,10 +9395,10 @@ static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, co
          (uint32_t)src0->nb[2] / src0_type_size,
          (uint32_t)src1->nb[1] / src1_type_size,
          (uint32_t)src2->nb[1] / src2_type_size,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version, bool dryrun = false) {
+static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) {
      GGML_ASSERT(version == 6 || version == 7);
      int num_srcs = version == 6 ? 6 : 7;
  
@@ -9407,10 +9411,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
      vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op);
      GGML_ASSERT(pipeline != nullptr);
  
-    if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        return;
-    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
  
      ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
      ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
@@ -9480,7 +9481,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
      }
  }
  
-static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
      const size_t seq_length = dst->src[0]->ne[2];
      const size_t n_embed = dst->ne[0];
      const size_t n_heads = dst->src[0]->ne[1];
@@ -9494,12 +9495,11 @@ static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx,
              (uint32_t)n_embed,
              (uint32_t)n_heads,
          },
-        6,
-        dryrun
+        6
      );
  }
  
-static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
      const size_t seq_length = dst->src[0]->ne[2];
      const size_t n_embed = dst->ne[0];
      const size_t n_heads = dst->src[0]->ne[1];
@@ -9513,12 +9513,11 @@ static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx,
              (uint32_t)n_embed,
              (uint32_t)n_heads,
          },
-        7,
-        dryrun
+        7
      );
  }
  
-static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
      const ggml_tensor * src2 = dst->src[2];
@@ -9540,10 +9539,7 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
      vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op);
      GGML_ASSERT(pipeline != nullptr);
  
-    if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        return;
-    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
  
      const int64_t s_off = ggml_nelements(src1) * sizeof(float);
  
@@ -9613,7 +9609,7 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
      }, pc, elements);
  }
  
-static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
  
@@ -9626,10 +9622,10 @@ static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx,
          (uint32_t)src0->ne[1],
          (uint32_t)dst->ne[1],
          (uint32_t)dst->ne[2],
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc, bool dryrun = false) {
+static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) {
      const ggml_tensor * x = dst->src[0];
      const ggml_tensor * g = dst->src[1];
      const ggml_tensor * gm = dst->src[2];
@@ -9655,10 +9651,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
      vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW);
      GGML_ASSERT(pipeline != nullptr);
  
-    if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        return;
-    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
  
      ggml_backend_vk_buffer_context * x_buf_ctx = (ggml_backend_vk_buffer_context *)x->buffer->context;
      ggml_backend_vk_buffer_context * g_buf_ctx = (ggml_backend_vk_buffer_context *)g->buffer->context;
@@ -9722,23 +9715,22 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
      }, pc, elements);
  }
  
-static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
      const size_t n = ggml_nelements(dst->src[0]);
  
      ggml_vk_op_f32_opt_step_adamw(
          ctx, subctx, dst,
-        { (uint32_t)n, 0, 0.0f, 0.0f },
-        dryrun
+        { (uint32_t)n, 0, 0.0f, 0.0f }
      );
  }
  
-static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
      const size_t n = ggml_nelements(dst->src[0]);
  
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f });
  }
  
-static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      int * op_params = (int *)dst->op_params;
  
      const uint32_t src0_type_size = ggml_type_size(src0->type);
@@ -9752,10 +9744,10 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, op_params[0],
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
  
@@ -9779,47 +9771,47 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
          (uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size,
          (uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
          sf0, sf1, sf2, sf3, pixel_offset
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
      p.param1 = ggml_get_op_params_f32(dst, 0);
      p.param2 = ggml_get_op_params_f32(dst, 1);
  
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p));
  }
  
-static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst));
  }
  
-static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst));
  }
  
-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst));
  }
  
-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst));
  }
  
-static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
      p.param1 = ggml_get_op_params_f32(dst, 0);
      p.param2 = ggml_get_op_params_f32(dst, 1);
  
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p));
  }
  
-static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst);
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p));
  }
  
-static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      const int32_t s0 = ggml_get_op_params_i32(dst, 0);
      const int32_t s1 = ggml_get_op_params_i32(dst, 1);
      const int32_t s2 = ggml_get_op_params_i32(dst, 2);
@@ -9831,20 +9823,20 @@ static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, cons
      memcpy(&p.param1, &s01_packed, sizeof(float));
      memcpy(&p.param2, &s23_packed, sizeof(float));
  
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p));
  }
  
-static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p));
  }
  
-static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p));
  }
  
-static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      uint32_t ne = (uint32_t)ggml_nelements(src0);
      if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
          // Convert from number of logical elements to 2- or 4-byte units.
@@ -9857,10 +9849,10 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
      }
  
      vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p));
  }
  
-static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9879,20 +9871,20 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          0.0f, 0.0f, 0,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
  }
  
-static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      float * op_params = (float *)dst->op_params;
  
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
  }
  
-static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      const int * int_op_params = (const int *)dst->op_params;
      const float * float_op_params = (const float *)dst->op_params;
  
@@ -9900,7 +9892,7 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx
      const float eps = float_op_params[1];
      const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
  
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f });
  }
  
  static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) {
@@ -9916,7 +9908,7 @@ static uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const g
      return num_bytes;
  }
  
-static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params, bool dryrun = false) {
+static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
      const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -9930,29 +9922,30 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx,
          (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
          0,
          op_params[0], 0.0f, (int32_t)param3,
-    }, dryrun);
+    });
  
-    if (ctx->do_add_rms_partials) {
+    if (ctx->do_add_rms_partials_offset_calculation) {
          ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0);
          ctx->do_add_rms_partials = false;
+        ctx->do_add_rms_partials_offset_calculation = false;
      }
  }
  
-static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
  }
  
-static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
  }
  
-static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
  }
  
-static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const float * op_params_f = (const float *)dst->op_params;
  
      const bool swapped = (bool)dst->op_params[1];
@@ -9980,15 +9973,15 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
              mode,
              alpha,
              limit
-        }, dryrun);
+        });
  }
  
-static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      int32_t * op_params = (int32_t *)dst->op_params;
-    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
+    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
  }
  
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
      float * op_params = (float *)dst->op_params;
  
      float scale = op_params[0];
@@ -10021,16 +10014,15 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
          n_head_log2,
          nrows_x,
          src2 != nullptr
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] });
  }
  
-static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
-
+static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
      topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
      ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
      ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
@@ -10050,10 +10042,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
  
      vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX);
  
-    if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        return;
-    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
  
      ggml_backend_vk_buffer_context * logits_buf_ctx = (ggml_backend_vk_buffer_context *)logits->buffer->context;
      ggml_backend_vk_buffer_context * weights_buf_ctx = (ggml_backend_vk_buffer_context *)weights->buffer->context;
@@ -10117,7 +10106,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
          }, pc, elements);
  }
  
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop, bool dryrun = false) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {
      ggml_tensor * dst = cgraph->nodes[node_idx];
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
@@ -10162,10 +10151,10 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
          freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
          src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
          { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      int32_t * op_params = (int32_t *)dst->op_params;
  
      uint32_t ncols = src0->ne[0];
@@ -10175,34 +10164,34 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
          ncols,
          nrows,
          op_params[0],
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0));
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p);
  }
  
-static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p);
  }
  
-static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
      p.weight = 1.0f / (float)src0->ne[0];
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p);
  }
  
-static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
+static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f });
  }
  
-static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
  }
  
-static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const int32_t s0 = dst->op_params[0];
      const int32_t s1 = dst->op_params[1];
      const int32_t p0 = dst->op_params[2];
@@ -10239,10 +10228,10 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
          pelements,
          IC * KH * KW,
          s0, s1, p0, p1, d0, d1,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_TENSOR_BINARY_OP_LOCALS
  
      const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
@@ -10305,20 +10294,20 @@ static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx,
      pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
      pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
  
-    ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc), dryrun);
+    ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc));
  }
  
-static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      const uint32_t dim = dst->op_params[0];
      const uint32_t max_period = dst->op_params[1];
      const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
  
      ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
          nb1, dim, max_period,
-    }, dryrun);
+    });
  }
  
-static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      // src0: (K, Cout, Cin, 1) -- kernel
      // src1: (L, Cin, 1, 1) -- input
      // dst: (*, Cout, 1, 1)
@@ -10346,10 +10335,10 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
      p.nb1 = static_cast<uint32_t>(nb1 / nb0);
      p.s0 = static_cast<uint32_t>(s0);
  
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
  }
  
-static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
      const int32_t k1 = dst->op_params[1];
      const int32_t k0 = dst->op_params[2];
@@ -10374,11 +10363,11 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
          parallel_elements,
          op,
          k0, k1, s0, s1, p0, p1,
-    }, dryrun);
+    });
  }
  
  static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
-                            const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+                            const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
      GGML_ASSERT(src1->type == GGML_TYPE_F32);
      GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -10423,11 +10412,11 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
      GGML_ASSERT(ne03 == ne2);
      GGML_ASSERT(ne02 == ne12);
  
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p));
  }
  
  static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
-                                      const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+                                      const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
      GGML_ASSERT(src1->type == GGML_TYPE_F32);
      GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -10472,10 +10461,10 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context
      GGML_ASSERT(ne02 == ne2);
      GGML_ASSERT(ne03 == ne12);
  
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p));
  }
  
-static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      vk_op_conv2d_dw_push_constants p{};
      p.ne = ggml_nelements(dst);
      p.channels = dst->ne[2];
@@ -10496,12 +10485,12 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx
      GGML_ASSERT(src0->ne[3] == p.channels);
      GGML_ASSERT(src1->ne[3] == p.batches);
  
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p));
  }
  
-static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
      const float * op_params = (const float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
  }
  
  #ifdef GGML_VULKAN_RUN_TESTS
@@ -10660,10 +10649,6 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
          }
      }
  
-    if (ctx->device->need_compiles) {
-        ggml_vk_load_shaders(ctx->device);
-    }
-
      ggml_pipeline_allocate_descriptor_sets(ctx);
  
      vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal});
@@ -10910,10 +10895,6 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
  
      ggml_pipeline_request_descriptor_sets(ctx, p, 1);
  
-    if (ctx->device->need_compiles) {
-        ggml_vk_load_shaders(ctx->device);
-    }
-
      ggml_pipeline_allocate_descriptor_sets(ctx);
  
      ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
@@ -11011,10 +10992,6 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
  //
  //     ggml_pipeline_request_descriptor_sets(ctx, p, 1);
  //
-//     if (ctx->device->need_compiles) {
-//         ggml_vk_load_shaders(ctx->device);
-//     }
-//
  //     ggml_pipeline_allocate_descriptor_sets(ctx);
  //
  //     ggml_vk_buffer_write(x_buf, 0, x, x_sz);
@@ -11185,10 +11162,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
          ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
      }
  
-    if (ctx->device->need_compiles) {
-        ggml_vk_load_shaders(ctx->device);
-    }
-
      ggml_pipeline_allocate_descriptor_sets(ctx);
  
      ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
@@ -11326,7 +11299,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
  }
  #endif
  
-static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
+static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) {
  #if defined(GGML_VULKAN_RUN_TESTS)
      const std::vector<size_t> vals {
          512, 512, 128,
@@ -11416,6 +11389,14 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
      GGML_ABORT("fatal error");
  #endif
  
+    if (subctx) {
+        // Submit and wait for any pending work before reallocating the buffers
+        ggml_vk_ctx_end(subctx);
+        ggml_vk_submit(subctx, ctx->fence);
+        ggml_vk_wait_for_fence(ctx);
+        ggml_vk_ctx_begin(ctx->device, subctx);
+    }
+
      if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
          VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
          // Resize buffer
@@ -11454,7 +11435,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph *
  
  // Returns true if node has enqueued work into the queue, false otherwise
  // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){
      ggml_tensor * node = cgraph->nodes[node_idx];
      if (ggml_is_empty(node) || !node->buffer) {
          return false;
@@ -11514,10 +11495,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
                  cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] &&
                  ggml_nrows(cgraph->nodes[next_node_idx]) == 1 &&
                  ctx->device->add_rms_fusion) {
-                if (dryrun) {
-                    ctx->prealloc_size_add_rms_partials += ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
+                uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
+                ctx->do_add_rms_partials_offset_calculation = true;
+                if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) {
+                    ctx->do_add_rms_partials = true;
                  }
-                ctx->do_add_rms_partials = true;
              }
          } break;
      case GGML_OP_REPEAT:
@@ -11585,81 +11567,15 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
  
      vk_context compute_ctx;
  
-    if (!dryrun) {
-        if (ctx->compute_ctx.expired()) {
-            compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
-            ctx->compute_ctx = compute_ctx;
-            ggml_vk_ctx_begin(ctx->device, compute_ctx);
-        } else {
-            compute_ctx = ctx->compute_ctx.lock();
-        }
+    if (ctx->compute_ctx.expired()) {
+        compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+        ctx->compute_ctx = compute_ctx;
+        ggml_vk_ctx_begin(ctx->device, compute_ctx);
      } else {
-        switch (node->op) {
-        case GGML_OP_REPEAT:
-        case GGML_OP_REPEAT_BACK:
-        case GGML_OP_ACC:
-        case GGML_OP_GET_ROWS:
-        case GGML_OP_ADD:
-        case GGML_OP_SUB:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_CONCAT:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_SIN:
-        case GGML_OP_COS:
-        case GGML_OP_CLAMP:
-        case GGML_OP_PAD:
-        case GGML_OP_CPY:
-        case GGML_OP_SET_ROWS:
-        case GGML_OP_CONT:
-        case GGML_OP_DUP:
-        case GGML_OP_SILU_BACK:
-        case GGML_OP_NORM:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_RMS_NORM_BACK:
-        case GGML_OP_L2_NORM:
-        case GGML_OP_UNARY:
-        case GGML_OP_GLU:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_SOFT_MAX_BACK:
-        case GGML_OP_ROPE_BACK:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_SUM:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-        case GGML_OP_COUNT_EQUAL:
-        case GGML_OP_IM2COL:
-        case GGML_OP_IM2COL_3D:
-        case GGML_OP_TIMESTEP_EMBEDDING:
-        case GGML_OP_CONV_TRANSPOSE_1D:
-        case GGML_OP_POOL_2D:
-        case GGML_OP_CONV_2D:
-        case GGML_OP_CONV_TRANSPOSE_2D:
-        case GGML_OP_CONV_2D_DW:
-        case GGML_OP_LEAKY_RELU:
-        case GGML_OP_OPT_STEP_SGD:
-            {
-                // These operations all go through ggml_vk_op_f32, so short-circuit and
-                // do the only thing needed for the dryrun.
-                vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
-                ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-                if (node->op == GGML_OP_RMS_NORM) {
-                    ctx->do_add_rms_partials = false;
-                }
-                return false;
-            }
-        default:
-            break;
-        }
+        compute_ctx = ctx->compute_ctx.lock();
      }
  
-    if (!dryrun) {
+    {
          // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers
          // to synchronize them. This handles most "normal" synchronization when computing the graph, and when
          // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers
@@ -11744,118 +11660,116 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
          }
      }
  #if ENABLE_SYNC_LOGGING
-    if (!dryrun) {
-        for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
-            auto *n = cgraph->nodes[node_idx + i];
-            std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " <<  n->name;
-            if (n->op == GGML_OP_GLU) {
-                std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
-            }
-            std::cerr << std::endl;
+    for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+        auto *n = cgraph->nodes[node_idx + i];
+        std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " <<  n->name;
+        if (n->op == GGML_OP_GLU) {
+            std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
          }
+        std::cerr << std::endl;
      }
  #endif
  
      switch (node->op) {
      case GGML_OP_REPEAT:
-        ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_repeat(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_REPEAT_BACK:
-        ggml_vk_repeat_back(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_repeat_back(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_ACC:
-        ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_acc(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_GET_ROWS:
-        ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_ADD:
          if (ctx->num_additional_fused_ops) {
-            ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx, dryrun);
+            ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx);
          } else {
-            ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);
+            ggml_vk_add(ctx, compute_ctx, src0, src1, node);
          }
          break;
      case GGML_OP_SUB:
-        ggml_vk_sub(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_sub(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_MUL:
-        ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_DIV:
-        ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_div(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_ADD_ID:
-        ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+        ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node);
  
          break;
      case GGML_OP_CONCAT:
-        ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_concat(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_UPSCALE:
-        ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_upscale(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SCALE:
-        ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_scale(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SQR:
-        ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_sqr(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SQRT:
-        ggml_vk_sqrt(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_sqrt(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SIN:
-        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_sin(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_COS:
-        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_cos(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_CLAMP:
-        ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_clamp(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_PAD:
-        ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_pad(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_ROLL:
-        ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_roll(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_CPY:
      case GGML_OP_CONT:
      case GGML_OP_DUP:
-        ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_cpy(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SET_ROWS:
-        ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_SILU_BACK:
-        ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_NORM:
-        ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_norm(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_GROUP_NORM:
-        ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_group_norm(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_RMS_NORM:
@@ -11863,17 +11777,17 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
              // fused rms_norm + mul
              ggml_tensor *mul = cgraph->nodes[node_idx + 1];
              ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0];
-            ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params, dryrun);
+            ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params);
          } else {
-            ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params, dryrun);
+            ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params);
          }
          break;
      case GGML_OP_RMS_NORM_BACK:
-        ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_L2_NORM:
-        ggml_vk_l2_norm(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_l2_norm(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_UNARY:
@@ -11888,7 +11802,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
          case GGML_UNARY_OP_SIGMOID:
          case GGML_UNARY_OP_HARDSIGMOID:
          case GGML_UNARY_OP_HARDSWISH:
-            ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
+            ggml_vk_unary(ctx, compute_ctx, src0, node);
              break;
          default:
              return false;
@@ -11902,151 +11816,147 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
          case GGML_GLU_OP_SWIGLU_OAI:
          case GGML_GLU_OP_GEGLU_ERF:
          case GGML_GLU_OP_GEGLU_QUICK:
-            ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun);
+            ggml_vk_glu(ctx, compute_ctx, src0, src1, node);
              break;
          default:
              return false;
          }
          break;
      case GGML_OP_DIAG_MASK_INF:
-        ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SOFT_MAX:
          if (ctx->num_additional_fused_ops) {
-            ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
+            ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
          } else {
-            ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+            ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node);
          }
  
          break;
      case GGML_OP_SOFT_MAX_BACK:
-        ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_ROPE:
-        ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false, dryrun);
+        ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false);
  
          break;
      case GGML_OP_ROPE_BACK:
-        ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true, dryrun);
+        ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true);
  
          break;
      case GGML_OP_ARGSORT:
          if (ctx->num_additional_fused_ops) {
-            ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
+            ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
          } else {
-            ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+            ggml_vk_argsort(ctx, compute_ctx, src0, node);
          }
  
          break;
      case GGML_OP_SUM:
-        ggml_vk_sum(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_sum(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_SUM_ROWS:
-        ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_sum_rows(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_MEAN:
-        ggml_vk_mean(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_mean(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_ARGMAX:
-        ggml_vk_argmax(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_argmax(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_COUNT_EQUAL:
-        ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_IM2COL:
-        ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_im2col(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_IM2COL_3D:
-        ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_TIMESTEP_EMBEDDING:
-        ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_CONV_TRANSPOSE_1D:
-        ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_POOL_2D:
-        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_pool_2d(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_CONV_2D:
-        ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_CONV_TRANSPOSE_2D:
-        ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_CONV_2D_DW:
-        ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
+        ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node);
  
          break;
      case GGML_OP_LEAKY_RELU:
-        ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
+        ggml_vk_leaky_relu(ctx, compute_ctx, src0, node);
  
          break;
      case GGML_OP_MUL_MAT:
-        ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx, dryrun);
+        ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx);
  
          break;
      case GGML_OP_MUL_MAT_ID:
-        ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx, dryrun);
+        ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx);
  
          break;
  
      case GGML_OP_FLASH_ATTN_EXT:
-        ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node, dryrun);
+        ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node);
  
          break;
  
      case GGML_OP_RWKV_WKV6:
-        ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun);
+        ggml_vk_rwkv_wkv6(ctx, compute_ctx, node);
  
          break;
  
      case GGML_OP_RWKV_WKV7:
-        ggml_vk_rwkv_wkv7(ctx, compute_ctx, node, dryrun);
+        ggml_vk_rwkv_wkv7(ctx, compute_ctx, node);
  
          break;
  
      case GGML_OP_SSM_SCAN:
-        ggml_vk_ssm_scan(ctx, compute_ctx, node, dryrun);
+        ggml_vk_ssm_scan(ctx, compute_ctx, node);
  
          break;
  
      case GGML_OP_SSM_CONV:
-        ggml_vk_ssm_conv(ctx, compute_ctx, node, dryrun);
+        ggml_vk_ssm_conv(ctx, compute_ctx, node);
  
          break;
  
      case GGML_OP_OPT_STEP_ADAMW:
-        ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun);
+        ggml_vk_opt_step_adamw(ctx, compute_ctx, node);
  
          break;
  
      case GGML_OP_OPT_STEP_SGD:
-        ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+        ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node);
  
          break;
      default:
          return false;
      }
  
-    if (dryrun) {
-        return false;
-    }
-
      ctx->tensor_ctxs[node_idx] = compute_ctx;
  
  #if defined(GGML_VULKAN_CHECK_RESULTS)
@@ -12919,58 +12829,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
          vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT*>(&dul));
      }
  
-    ctx->prealloc_size_add_rms_partials = 0;
      ctx->prealloc_size_add_rms_partials_offset = 0;
      ctx->do_add_rms_partials = false;
-
-    uint64_t total_mat_mul_bytes = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (!ctx->device->disable_fusion) {
-            uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
-            if (num_adds) {
-                ctx->num_additional_fused_ops = num_adds - 1;
-            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
-                ctx->num_additional_fused_ops = 1;
-            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
-                ctx->num_additional_fused_ops = 1;
-            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
-                ctx->num_additional_fused_ops = 1;
-            } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
-                       ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
-                       ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
-                ctx->num_additional_fused_ops = 2;
-            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
-                       ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
-                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
-                ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
-            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
-                       ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
-                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
-                ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
-            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
-                       ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
-                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
-                ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
-            }
-        }
-        ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
-        if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
-            total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
-        } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D || cgraph->nodes[i]->op == GGML_OP_CONV_TRANSPOSE_2D) {
-            // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode.
-            auto CRS_size =
-                cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[1]->ne[2];
-            auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3];
-            total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type);
-        }
-        i += ctx->num_additional_fused_ops;
-        ctx->num_additional_fused_ops = 0;
-    }
-    if (ctx->device->need_compiles) {
-        ggml_vk_load_shaders(ctx->device);
-    }
-    ggml_vk_preallocate_buffers(ctx);
-    ggml_pipeline_allocate_descriptor_sets(ctx);
+    ctx->do_add_rms_partials_offset_calculation = false;
  
      int last_node = cgraph->n_nodes - 1;
  
@@ -13012,6 +12873,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
      ctx->prealloc_y_last_tensor_used = nullptr;
  
      if (ctx->prealloc_size_add_rms_partials) {
+        ggml_vk_preallocate_buffers(ctx, nullptr);
          if (ctx->compute_ctx.expired()) {
              compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
              ctx->compute_ctx = compute_ctx;
@@ -13032,14 +12894,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
      int submitted_nodes = 0;
      int submit_count = 0;
      uint64_t mul_mat_bytes = 0;
-    uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), total_mat_mul_bytes / 40u);
+    uint64_t total_mul_mat_bytes = 0;
+    uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u);
      for (int i = 0; i < cgraph->n_nodes; i++) {
          if (first_node_in_batch) {
              submit_node_idx = i;
          }
  
          if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
-            mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
+            auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]);
+            mul_mat_bytes += bytes;
+            total_mul_mat_bytes += bytes;
          }
  
          if (!ctx->device->disable_fusion) {
@@ -13081,11 +12946,11 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
          // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
          bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
          bool submit = (submitted_nodes >= nodes_per_submit) ||
-                      (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
+                      (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
                        (i + ctx->num_additional_fused_ops >= last_node) ||
                        (almost_ready && !ctx->almost_ready_fence_pending);
  
-        bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
  
          if (vk_perf_logger_enabled) {
              if (ctx->compute_ctx.expired()) {
@@ -13125,6 +12990,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
          ctx->fused_ops_write_mask = 0;
      }
  
+    ctx->prealloc_size_add_rms_partials = std::max(ctx->prealloc_size_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset);
+    ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
+
      if (vk_perf_logger_enabled) {
          // End the command buffer and submit/wait
          GGML_ASSERT(!ctx->compute_ctx.expired());
author	Jeff Bolz <redacted>
	Tue, 4 Nov 2025 19:28:17 +0000 (13:28 -0600)
committer	Georgi Gerganov <redacted>
	Sun, 9 Nov 2025 16:30:22 +0000 (18:30 +0200)