uint32_t align;
// true if fields have been set by ggml_vk_create_pipeline
bool initialized {};
- // set to true to request the pipeline is compiled after the dryrun
+ // set to true to request that the pipeline be compiled
bool needed {};
// set to true when the shader has been compiled
bool compiled {};
bool mul_mat_id_m[GGML_TYPE_COUNT];
bool mul_mat_id_s[GGML_TYPE_COUNT];
- // set to true to indicate that some shaders need to be compiled after the dryrun
- bool need_compiles {};
-
vk::DescriptorSetLayout dsl;
vk_matmul_pipeline pipeline_matmul_f32 {};
std::vector<vk_context> contexts;
};
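+// Forward declarations: with the dryrun pass removed, descriptor set requests
+// compile shaders and allocate descriptor sets on demand, and ops grow their
+// scratch buffers inline via ggml_vk_preallocate_buffers.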
+static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
+static void ggml_vk_load_shaders(vk_device& device);
+static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
+
#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
bool almost_ready_fence_pending {};
// Set before op_add and unset after op_rms_norm to indicate that the add should
// write partial sums to accumulate the square of the vector components
bool do_add_rms_partials;
+ // Cleared after op_rms_norm once prealloc_size_add_rms_partials_offset has been
+ // advanced (see ggml_vk_rms_norm).
+ bool do_add_rms_partials_offset_calculation;
+ uint64_t last_total_mul_mat_bytes {};
+
// Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
const ggml_tensor * prealloc_y_last_tensor_used {};
ctx->pipeline_descriptor_set_requirements += n;
if (!pipeline->compiled) {
pipeline->needed = true;
- ctx->device->need_compiles = true;
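+ // Compile the missing shader now rather than flagging the device for a post-dryrun compile pass.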
+ ggml_vk_load_shaders(ctx->device);
}
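+ // Allocate the descriptor sets for this request immediately as well.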
+ ggml_pipeline_allocate_descriptor_sets(ctx);
}
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
vk_device& device = ctx->device;
- uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
+ // Grow by 50% to avoid frequent allocations
+ uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements});
+ uint32_t to_alloc = needed - ctx->descriptor_sets.size();
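+ // Descriptor sets come from fixed-size pools; compute how much room is left in the current pool and which pool index to continue filling.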
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
for (auto &c : compiles) {
c.wait();
}
- device->need_compiles = false;
}
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
ctx->prealloc_size_x = 0;
ctx->prealloc_size_y = 0;
ctx->prealloc_size_split_k = 0;
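+ // also reset the size of the buffer holding partial sums for fused add + rms_norm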
+ ctx->prealloc_size_add_rms_partials = 0;
ctx->fence = ctx->device->device.createFence({});
ctx->almost_ready_fence = ctx->device->device.createFence({});
ggml_vk_sync_buffers(ctx, subctx);
}
-static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k, bool dryrun = false) {
+static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) {
VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
- std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+ std::cerr << "))");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
}
- if (dryrun) {
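+ // Size scratch buffers and request descriptor sets inline; this block previously ran only during the dryrun.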
+ {
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
uint64_t y_sz_upd = y_sz * ne12 * ne13;
if (quantize_y) {
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
ctx->prealloc_size_x = x_sz_upd;
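+ // Reallocate the scratch buffers right away so the larger size is available for this dispatch.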
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
ctx->prealloc_size_y = y_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
ctx->prealloc_size_split_k = split_k_size;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
// Request descriptor sets
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
}
- return;
}
vk_buffer d_D = dst_buf_ctx->dev_buffer;
GGML_UNUSED(k);
}
-static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
ggml_tensor * dst = cgraph->nodes[node_idx];
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
- std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)");
+ std::cerr << "))");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
const uint64_t y_sz = quantize_y ? (y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
const uint64_t d_sz = sizeof(float) * d_ne;
- if (dryrun) {
+ {
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
uint64_t y_sz_upd = y_sz * ne12 * ne13;
if (quantize_y) {
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
ctx->prealloc_size_x = x_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
ctx->prealloc_size_y = y_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
}
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
- return;
}
vk_buffer d_D;
}
}
-static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
ggml_tensor * dst = cgraph->nodes[node_idx];
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
- std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+ std::cerr << "))");
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
gqa_ratio = 1;
}
- if (dryrun) {
+ {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
- return;
}
vk_buffer d_D;
}, pc, { 1, (uint32_t)ne01, workgroups_z });
}
-static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
ggml_tensor * dst = cgraph->nodes[node_idx];
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
- std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+ std::cerr << "))");
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
const uint64_t qy_sz = ggml_nbytes(src1);
const uint64_t d_sz = sizeof(float) * d_ne;
- if (dryrun) {
+ {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
- return;
}
vk_buffer d_D;
}, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
}
-static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
ggml_tensor * dst = cgraph->nodes[node_idx];
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
dst2.ne[0] = cur_M_size;
src02.ne[1] = cur_M_size;
- ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true, dryrun);
+ ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true);
m_offset += cur_M_size;
}
src1->nb[1] <= src1->nb[3] &&
src0->ne[3] == 1 &&
src1->ne[3] == 1) {
- ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx, dryrun);
+ ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx);
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
!ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
- ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx, dryrun);
+ ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx);
// mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
// when ne12 and ne13 are one.
} else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) {
- ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx, dryrun);
+ ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx);
} else {
- ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false, dryrun);
+ ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false);
}
}
-static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
}
- if (dryrun) {
+ {
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
uint64_t y_sz_upd = y_sz * ne12 * ne13;
if (quantize_y) {
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
ctx->prealloc_size_x = x_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
ctx->prealloc_size_y = y_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
// Request descriptor sets
if (quantize_y) {
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
}
- return;
}
vk_buffer d_D = dst_buf_ctx->dev_buffer;
}
}
-static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
ggml_tensor * dst = cgraph->nodes[node_idx];
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
- std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+ std::cerr << "))");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
GGML_ASSERT(dmmv != nullptr);
- if (dryrun) {
+ {
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
if (
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
ctx->prealloc_size_x = x_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
ctx->prealloc_size_y = y_sz_upd;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
- return;
}
vk_buffer d_D;
return src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type));
}
-static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
ggml_tensor * dst = cgraph->nodes[node_idx];
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
ggml_tensor * src2 = dst->src[2];
VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) {
- ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx, dryrun);
+ ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx);
} else {
- ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+ ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst);
}
}
return supported;
}
-static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3];
if (sinks) {
std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3];
}
- std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+ std::cerr << "))");
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
}
if (ctx->prealloc_size_split_k < split_k_size) {
ctx->prealloc_size_split_k = split_k_size;
+ ggml_vk_preallocate_buffers(ctx, subctx);
}
- if (dryrun) {
+ {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
}
- return;
}
float scale = 1.0f;
}
template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) {
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
if (src1 != nullptr) {
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3];
}
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
- std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
+ std::cerr << "), " << ggml_op_name(op) << ")");
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
GGML_ASSERT(dst->buffer != nullptr);
GGML_ABORT("fatal error");
}
- if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- return;
- }
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
}
}
-static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, 0,
- }, dryrun);
+ });
}
-static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, offset,
- }, dryrun);
+ });
}
-static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
const ggml_tensor *first_node = cgraph->nodes[node_idx];
const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
GGML_ABORT("fatal error");
}
- if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- return;
- }
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT];
vk_buffer buf[MAX_PARAMETER_COUNT];
}, pc, elements);
}
-static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, ctx->do_add_rms_partials,
- }, dryrun);
+ });
}
-static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, 0,
- }, dryrun);
+ });
}
-static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, 0,
- }, dryrun);
+ });
}
-static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, 0,
- }, dryrun);
+ });
}
-static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t src2_type_size = ggml_type_size(src2->type);
(uint32_t)src0->nb[2] / src0_type_size,
(uint32_t)src1->nb[1] / src1_type_size,
(uint32_t)src2->nb[1] / src2_type_size,
- }, dryrun);
+ });
}
-static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version, bool dryrun = false) {
+static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) {
GGML_ASSERT(version == 6 || version == 7);
int num_srcs = version == 6 ? 6 : 7;
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op);
GGML_ASSERT(pipeline != nullptr);
- if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- return;
- }
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
}
}
-static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
const size_t seq_length = dst->src[0]->ne[2];
const size_t n_embed = dst->ne[0];
const size_t n_heads = dst->src[0]->ne[1];
(uint32_t)n_embed,
(uint32_t)n_heads,
},
- 6,
- dryrun
+ 6
);
}
-static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
const size_t seq_length = dst->src[0]->ne[2];
const size_t n_embed = dst->ne[0];
const size_t n_heads = dst->src[0]->ne[1];
(uint32_t)n_embed,
(uint32_t)n_heads,
},
- 7,
- dryrun
+ 7
);
}
-static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op);
GGML_ASSERT(pipeline != nullptr);
- if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- return;
- }
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
const int64_t s_off = ggml_nelements(src1) * sizeof(float);
}, pc, elements);
}
-static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
(uint32_t)src0->ne[1],
(uint32_t)dst->ne[1],
(uint32_t)dst->ne[2],
- }, dryrun);
+ });
}
-static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc, bool dryrun = false) {
+static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) {
const ggml_tensor * x = dst->src[0];
const ggml_tensor * g = dst->src[1];
const ggml_tensor * gm = dst->src[2];
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW);
GGML_ASSERT(pipeline != nullptr);
- if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- return;
- }
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
ggml_backend_vk_buffer_context * x_buf_ctx = (ggml_backend_vk_buffer_context *)x->buffer->context;
ggml_backend_vk_buffer_context * g_buf_ctx = (ggml_backend_vk_buffer_context *)g->buffer->context;
}, pc, elements);
}
-static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
const size_t n = ggml_nelements(dst->src[0]);
ggml_vk_op_f32_opt_step_adamw(
ctx, subctx, dst,
- { (uint32_t)n, 0, 0.0f, 0.0f },
- dryrun
+ { (uint32_t)n, 0, 0.0f, 0.0f }
);
}
-static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const size_t n = ggml_nelements(dst->src[0]);
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f });
}
-static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
int * op_params = (int *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, op_params[0],
- }, dryrun);
+ });
}
-static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
(uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size,
(uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
sf0, sf1, sf2, sf3, pixel_offset
- }, dryrun);
+ });
}
-static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
p.param1 = ggml_get_op_params_f32(dst, 0);
p.param2 = ggml_get_op_params_f32(dst, 1);
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p));
}
-static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst));
}
-static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst));
}
-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst));
}
-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst));
}
-static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
p.param1 = ggml_get_op_params_f32(dst, 0);
p.param2 = ggml_get_op_params_f32(dst, 1);
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p));
}
-static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst);
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p));
}
-static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const int32_t s0 = ggml_get_op_params_i32(dst, 0);
const int32_t s1 = ggml_get_op_params_i32(dst, 1);
const int32_t s2 = ggml_get_op_params_i32(dst, 2);
memcpy(&p.param1, &s01_packed, sizeof(float));
memcpy(&p.param2, &s23_packed, sizeof(float));
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p));
}
-static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p));
}
-static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p));
}
-static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
uint32_t ne = (uint32_t)ggml_nelements(src0);
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units.
}
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p));
}
-static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f, 0,
- }, dryrun);
+ });
}
-static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
}
-static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const int * int_op_params = (const int *)dst->op_params;
const float * float_op_params = (const float *)dst->op_params;
const float eps = float_op_params[1];
const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f });
}
static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) {
return num_bytes;
}
-static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params, bool dryrun = false) {
+static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
op_params[0], 0.0f, (int32_t)param3,
- }, dryrun);
+ });
- if (ctx->do_add_rms_partials) {
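+ // Advance the partials offset consumed by this add+rms_norm pair and clear the flags for the next one.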
+ if (ctx->do_add_rms_partials_offset_calculation) {
ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0);
ctx->do_add_rms_partials = false;
+ ctx->do_add_rms_partials_offset_calculation = false;
}
}
-static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
}
-static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const float * op_params_f = (const float *)dst->op_params;
const bool swapped = (bool)dst->op_params[1];
mode,
alpha,
limit
- }, dryrun);
+ });
}
-static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
- ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
+ ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
}
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
float scale = op_params[0];
n_head_log2,
nrows_x,
src2 != nullptr
- }, dryrun);
+ });
}
-static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] });
}
-static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
-
+static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX);
- if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- return;
- }
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
ggml_backend_vk_buffer_context * logits_buf_ctx = (ggml_backend_vk_buffer_context *)logits->buffer->context;
ggml_backend_vk_buffer_context * weights_buf_ctx = (ggml_backend_vk_buffer_context *)weights->buffer->context;
}, pc, elements);
}
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop, bool dryrun = false) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {
ggml_tensor * dst = cgraph->nodes[node_idx];
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
{ sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride,
- }, dryrun);
+ });
}
-static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
uint32_t ncols = src0->ne[0];
ncols,
nrows,
op_params[0],
- }, dryrun);
+ });
}
-static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0));
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p, dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p);
}
-static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p, dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p);
}
-static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
p.weight = 1.0f / (float)src0->ne[0];
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p, dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p);
}
-static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
+static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f });
}
-static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
}
-static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int32_t s0 = dst->op_params[0];
const int32_t s1 = dst->op_params[1];
const int32_t p0 = dst->op_params[2];
pelements,
IC * KH * KW,
s0, s1, p0, p1, d0, d1,
- }, dryrun);
+ });
}
-static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
- ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc), dryrun);
+ ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc));
}
-static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t dim = dst->op_params[0];
const uint32_t max_period = dst->op_params[1];
const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
nb1, dim, max_period,
- }, dryrun);
+ });
}
-static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// src0: (K, Cout, Cin, 1) -- kernel
// src1: (L, Cin, 1, 1) -- input
// dst: (*, Cout, 1, 1)
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
p.s0 = static_cast<uint32_t>(s0);
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
}
-static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
const int32_t k1 = dst->op_params[1];
const int32_t k0 = dst->op_params[2];
parallel_elements,
op,
k0, k1, s0, s1, p0, p1,
- }, dryrun);
+ });
}
static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
- const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+ const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ne03 == ne2);
GGML_ASSERT(ne02 == ne12);
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p));
}
static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
- const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+ const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ne02 == ne2);
GGML_ASSERT(ne03 == ne12);
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p));
}
-static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
vk_op_conv2d_dw_push_constants p{};
p.ne = ggml_nelements(dst);
p.channels = dst->ne[2];
GGML_ASSERT(src0->ne[3] == p.channels);
GGML_ASSERT(src1->ne[3] == p.batches);
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p));
}
-static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const float * op_params = (const float *)dst->op_params;
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
}
#ifdef GGML_VULKAN_RUN_TESTS
}
}
- if (ctx->device->need_compiles) {
- ggml_vk_load_shaders(ctx->device);
- }
-
ggml_pipeline_allocate_descriptor_sets(ctx);
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal});
ggml_pipeline_request_descriptor_sets(ctx, p, 1);
- if (ctx->device->need_compiles) {
- ggml_vk_load_shaders(ctx->device);
- }
-
ggml_pipeline_allocate_descriptor_sets(ctx);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
//
// ggml_pipeline_request_descriptor_sets(ctx, p, 1);
//
-// if (ctx->device->need_compiles) {
-// ggml_vk_load_shaders(ctx->device);
-// }
-//
// ggml_pipeline_allocate_descriptor_sets(ctx);
//
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
}
- if (ctx->device->need_compiles) {
- ggml_vk_load_shaders(ctx->device);
- }
-
ggml_pipeline_allocate_descriptor_sets(ctx);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
}
#endif
-static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
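+// If subctx is non-null, any work already recorded into it is submitted and waited on before buffers are reallocated.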
+static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) {
#if defined(GGML_VULKAN_RUN_TESTS)
const std::vector<size_t> vals {
512, 512, 128,
GGML_ABORT("fatal error");
#endif
+ if (subctx) {
+ // Submit and wait for any pending work before reallocating the buffers
+ ggml_vk_ctx_end(subctx);
+ ggml_vk_submit(subctx, ctx->fence);
+ ggml_vk_wait_for_fence(ctx);
+ ggml_vk_ctx_begin(ctx->device, subctx);
+ }
+
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
// Resize buffer
 // Returns true if the node has enqueued work into the queue, false otherwise
 // If submit is true, all operations queued so far are submitted to Vulkan, overlapping cmdlist creation with GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){
ggml_tensor * node = cgraph->nodes[node_idx];
if (ggml_is_empty(node) || !node->buffer) {
return false;
cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] &&
ggml_nrows(cgraph->nodes[next_node_idx]) == 1 &&
ctx->device->add_rms_fusion) {
- if (dryrun) {
- ctx->prealloc_size_add_rms_partials += ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
+ uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
+ ctx->do_add_rms_partials_offset_calculation = true;
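+ // Keep tracking the required offset either way, but only enable the fused partials path when the
+ // preallocated partials buffer (grown from earlier graphs) still has room for this node's partial sums.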
+ if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) {
+ ctx->do_add_rms_partials = true;
}
- ctx->do_add_rms_partials = true;
}
} break;
case GGML_OP_REPEAT:
vk_context compute_ctx;
- if (!dryrun) {
- if (ctx->compute_ctx.expired()) {
- compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
- ctx->compute_ctx = compute_ctx;
- ggml_vk_ctx_begin(ctx->device, compute_ctx);
- } else {
- compute_ctx = ctx->compute_ctx.lock();
- }
+ if (ctx->compute_ctx.expired()) {
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+ ctx->compute_ctx = compute_ctx;
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
- switch (node->op) {
- case GGML_OP_REPEAT:
- case GGML_OP_REPEAT_BACK:
- case GGML_OP_ACC:
- case GGML_OP_GET_ROWS:
- case GGML_OP_ADD:
- case GGML_OP_SUB:
- case GGML_OP_MUL:
- case GGML_OP_DIV:
- case GGML_OP_CONCAT:
- case GGML_OP_UPSCALE:
- case GGML_OP_SCALE:
- case GGML_OP_SQR:
- case GGML_OP_SQRT:
- case GGML_OP_SIN:
- case GGML_OP_COS:
- case GGML_OP_CLAMP:
- case GGML_OP_PAD:
- case GGML_OP_CPY:
- case GGML_OP_SET_ROWS:
- case GGML_OP_CONT:
- case GGML_OP_DUP:
- case GGML_OP_SILU_BACK:
- case GGML_OP_NORM:
- case GGML_OP_GROUP_NORM:
- case GGML_OP_RMS_NORM:
- case GGML_OP_RMS_NORM_BACK:
- case GGML_OP_L2_NORM:
- case GGML_OP_UNARY:
- case GGML_OP_GLU:
- case GGML_OP_DIAG_MASK_INF:
- case GGML_OP_SOFT_MAX:
- case GGML_OP_SOFT_MAX_BACK:
- case GGML_OP_ROPE_BACK:
- case GGML_OP_ARGSORT:
- case GGML_OP_SUM:
- case GGML_OP_SUM_ROWS:
- case GGML_OP_MEAN:
- case GGML_OP_ARGMAX:
- case GGML_OP_COUNT_EQUAL:
- case GGML_OP_IM2COL:
- case GGML_OP_IM2COL_3D:
- case GGML_OP_TIMESTEP_EMBEDDING:
- case GGML_OP_CONV_TRANSPOSE_1D:
- case GGML_OP_POOL_2D:
- case GGML_OP_CONV_2D:
- case GGML_OP_CONV_TRANSPOSE_2D:
- case GGML_OP_CONV_2D_DW:
- case GGML_OP_LEAKY_RELU:
- case GGML_OP_OPT_STEP_SGD:
- {
- // These operations all go through ggml_vk_op_f32, so short-circuit and
- // do the only thing needed for the dryrun.
- vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
- ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
- if (node->op == GGML_OP_RMS_NORM) {
- ctx->do_add_rms_partials = false;
- }
- return false;
- }
- default:
- break;
- }
+ compute_ctx = ctx->compute_ctx.lock();
}
- if (!dryrun) {
+ {
        // This logic detects dependencies between nodes in the graph and calls ggml_vk_sync_buffers
// to synchronize them. This handles most "normal" synchronization when computing the graph, and when
// there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers
}
}
#if ENABLE_SYNC_LOGGING
- if (!dryrun) {
- for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
- auto *n = cgraph->nodes[node_idx + i];
- std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
- if (n->op == GGML_OP_GLU) {
- std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
- }
- std::cerr << std::endl;
+ for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+ auto *n = cgraph->nodes[node_idx + i];
+ std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
+ if (n->op == GGML_OP_GLU) {
+ std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
}
+ std::cerr << std::endl;
}
#endif
switch (node->op) {
case GGML_OP_REPEAT:
- ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_repeat(ctx, compute_ctx, src0, node);
break;
case GGML_OP_REPEAT_BACK:
- ggml_vk_repeat_back(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_repeat_back(ctx, compute_ctx, src0, node);
break;
case GGML_OP_ACC:
- ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_acc(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_GET_ROWS:
- ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ADD:
if (ctx->num_additional_fused_ops) {
- ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx);
} else {
- ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_add(ctx, compute_ctx, src0, src1, node);
}
break;
case GGML_OP_SUB:
- ggml_vk_sub(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_sub(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_MUL:
- ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_DIV:
- ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_div(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ADD_ID:
- ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+ ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node);
break;
case GGML_OP_CONCAT:
- ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_concat(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_UPSCALE:
- ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_upscale(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SCALE:
- ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_scale(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SQR:
- ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_sqr(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SQRT:
- ggml_vk_sqrt(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_sqrt(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SIN:
- ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_sin(ctx, compute_ctx, src0, node);
break;
case GGML_OP_COS:
- ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_cos(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CLAMP:
- ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_clamp(ctx, compute_ctx, src0, node);
break;
case GGML_OP_PAD:
- ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_pad(ctx, compute_ctx, src0, node);
break;
case GGML_OP_ROLL:
- ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_roll(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
- ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_cpy(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SET_ROWS:
- ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_SILU_BACK:
- ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_NORM:
- ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_GROUP_NORM:
- ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_group_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_RMS_NORM:
// fused rms_norm + mul
ggml_tensor *mul = cgraph->nodes[node_idx + 1];
ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0];
- ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params, dryrun);
+ ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params);
} else {
- ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params, dryrun);
+ ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params);
}
break;
case GGML_OP_RMS_NORM_BACK:
- ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_L2_NORM:
- ggml_vk_l2_norm(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_l2_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_UNARY:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
- ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_unary(ctx, compute_ctx, src0, node);
break;
default:
return false;
case GGML_GLU_OP_SWIGLU_OAI:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
- ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_glu(ctx, compute_ctx, src0, src1, node);
break;
default:
return false;
}
break;
case GGML_OP_DIAG_MASK_INF:
- ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SOFT_MAX:
if (ctx->num_additional_fused_ops) {
- ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
} else {
- ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+ ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node);
}
break;
case GGML_OP_SOFT_MAX_BACK:
- ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ROPE:
- ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false, dryrun);
+ ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false);
break;
case GGML_OP_ROPE_BACK:
- ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true, dryrun);
+ ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true);
break;
case GGML_OP_ARGSORT:
if (ctx->num_additional_fused_ops) {
- ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
} else {
- ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_argsort(ctx, compute_ctx, src0, node);
}
break;
case GGML_OP_SUM:
- ggml_vk_sum(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_sum(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SUM_ROWS:
- ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_sum_rows(ctx, compute_ctx, src0, node);
break;
case GGML_OP_MEAN:
- ggml_vk_mean(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_mean(ctx, compute_ctx, src0, node);
break;
case GGML_OP_ARGMAX:
- ggml_vk_argmax(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_argmax(ctx, compute_ctx, src0, node);
break;
case GGML_OP_COUNT_EQUAL:
- ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_IM2COL:
- ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_im2col(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_IM2COL_3D:
- ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_TIMESTEP_EMBEDDING:
- ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
- ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_POOL_2D:
- ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_pool_2d(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CONV_2D:
- ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_CONV_TRANSPOSE_2D:
- ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_CONV_2D_DW:
- ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
+ ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_LEAKY_RELU:
- ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
+ ggml_vk_leaky_relu(ctx, compute_ctx, src0, node);
break;
case GGML_OP_MUL_MAT:
- ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx);
break;
case GGML_OP_MUL_MAT_ID:
- ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx);
break;
case GGML_OP_FLASH_ATTN_EXT:
- ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node, dryrun);
+ ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node);
break;
case GGML_OP_RWKV_WKV6:
- ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun);
+ ggml_vk_rwkv_wkv6(ctx, compute_ctx, node);
break;
case GGML_OP_RWKV_WKV7:
- ggml_vk_rwkv_wkv7(ctx, compute_ctx, node, dryrun);
+ ggml_vk_rwkv_wkv7(ctx, compute_ctx, node);
break;
case GGML_OP_SSM_SCAN:
- ggml_vk_ssm_scan(ctx, compute_ctx, node, dryrun);
+ ggml_vk_ssm_scan(ctx, compute_ctx, node);
break;
case GGML_OP_SSM_CONV:
- ggml_vk_ssm_conv(ctx, compute_ctx, node, dryrun);
+ ggml_vk_ssm_conv(ctx, compute_ctx, node);
break;
case GGML_OP_OPT_STEP_ADAMW:
- ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun);
+ ggml_vk_opt_step_adamw(ctx, compute_ctx, node);
break;
case GGML_OP_OPT_STEP_SGD:
- ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+ ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node);
break;
default:
return false;
}
- if (dryrun) {
- return false;
- }
-
ctx->tensor_ctxs[node_idx] = compute_ctx;
#if defined(GGML_VULKAN_CHECK_RESULTS)
vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT*>(&dul));
}
- ctx->prealloc_size_add_rms_partials = 0;
ctx->prealloc_size_add_rms_partials_offset = 0;
ctx->do_add_rms_partials = false;
-
- uint64_t total_mat_mul_bytes = 0;
- for (int i = 0; i < cgraph->n_nodes; i++) {
- if (!ctx->device->disable_fusion) {
- uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
- if (num_adds) {
- ctx->num_additional_fused_ops = num_adds - 1;
- } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
- ctx->num_additional_fused_ops = 1;
- } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
- ctx->num_additional_fused_ops = 1;
- } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
- ctx->num_additional_fused_ops = 1;
- } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
- ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
- ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
- ctx->num_additional_fused_ops = 2;
- } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
- ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
- ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
- ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
- } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
- ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
- ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
- ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
- } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
- ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
- ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
- ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
- }
- }
- ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
- if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
- total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
- } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D || cgraph->nodes[i]->op == GGML_OP_CONV_TRANSPOSE_2D) {
- // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode.
- auto CRS_size =
- cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[1]->ne[2];
- auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3];
- total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type);
- }
- i += ctx->num_additional_fused_ops;
- ctx->num_additional_fused_ops = 0;
- }
- if (ctx->device->need_compiles) {
- ggml_vk_load_shaders(ctx->device);
- }
- ggml_vk_preallocate_buffers(ctx);
- ggml_pipeline_allocate_descriptor_sets(ctx);
+ ctx->do_add_rms_partials_offset_calculation = false;
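+ // Note: prealloc_size_add_rms_partials is intentionally not reset here; it persists across graphs
+ // and only grows (see the std::max update after the node loop).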
int last_node = cgraph->n_nodes - 1;
ctx->prealloc_y_last_tensor_used = nullptr;
if (ctx->prealloc_size_add_rms_partials) {
+ ggml_vk_preallocate_buffers(ctx, nullptr);
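+ // A null subctx means ggml_vk_preallocate_buffers does not submit/wait on any pending work before resizing.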
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
int submitted_nodes = 0;
int submit_count = 0;
uint64_t mul_mat_bytes = 0;
- uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), total_mat_mul_bytes / 40u);
+ uint64_t total_mul_mat_bytes = 0;
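+ // Derive the per-submit matmul byte budget from the previous graph's total; on the first graph this is 0,
+ // which disables the byte-based submit threshold below.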
+ uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u);
for (int i = 0; i < cgraph->n_nodes; i++) {
if (first_node_in_batch) {
submit_node_idx = i;
}
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
- mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
+ auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]);
+ mul_mat_bytes += bytes;
+ total_mul_mat_bytes += bytes;
}
if (!ctx->device->disable_fusion) {
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
bool submit = (submitted_nodes >= nodes_per_submit) ||
- (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
+ (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
(i + ctx->num_additional_fused_ops >= last_node) ||
(almost_ready && !ctx->almost_ready_fence_pending);
- bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
+ bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
if (vk_perf_logger_enabled) {
if (ctx->compute_ctx.expired()) {
ctx->fused_ops_write_mask = 0;
}
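+ // Record the sizes observed while building this graph so the next graph can size the partials buffer
+ // and the submit budget without a separate dryrun pass.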
+ ctx->prealloc_size_add_rms_partials = std::max(ctx->prealloc_size_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset);
+ ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
+
if (vk_perf_logger_enabled) {
// End the command buffer and submit/wait
GGML_ASSERT(!ctx->compute_ctx.expired());