// Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT.
#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 3)
+typedef std::shared_ptr<struct vk_pipeline_struct> vk_pipeline;
+
struct vk_pipeline_struct {
std::string name;
vk::ShaderModule shader_module;
std::atomic<bool> compiled {};
// number of registers used, extracted from pipeline executable properties
uint32_t register_count {};
+
+#if defined(VK_EXT_shader_64bit_indexing)
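+    // true if this pipeline is the variant compiled with 64-bit indexing enabled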
+ bool is_64b_indexing {};
+#endif
+ // linked list of pipelines for multiple compilation variants.
+ // currently only used to compile a 64-bit indexing variant.
+ vk_pipeline next;
};
-typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
bool add_rms_fusion;
uint32_t partials_binding_alignment;
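+    // set when the device advertises VK_EXT_shader_64bit_indexing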
+ bool shader_64b_indexing;
+
bool integer_dot_product;
// 0: default, 1: force mmvq, -1: disable mmvq
int32_t mmvq_mode;
compute_pipeline_create_info.setPNext(&rci);
}
+#if defined(VK_EXT_shader_64bit_indexing)
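+    // For the 64-bit indexing variant, request the 64-bit indexing flag via the flags2 struct;
+    // statistics capture is re-requested here as well when pipeline executable properties are supported.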
+ vk::PipelineCreateFlags2CreateInfo pipelineFlags2CreateInfo;
+    if (pipeline->is_64b_indexing) {
+ pipelineFlags2CreateInfo.flags = vk::PipelineCreateFlagBits2::e64BitIndexingEXT;
+ if (device->pipeline_executable_properties_support) {
+ pipelineFlags2CreateInfo.flags |= vk::PipelineCreateFlagBits2::eCaptureStatisticsKHR;
+ }
+ pipelineFlags2CreateInfo.setPNext(compute_pipeline_create_info.pNext);
+ compute_pipeline_create_info.setPNext(&pipelineFlags2CreateInfo);
+ }
+#endif
+
try {
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
} catch (const vk::SystemError& e) {
}
std::vector<std::future<void>> compiles;
- auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint,
+ auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint,
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
required_subgroup_size = get_subgroup_size(name, device->architecture);
}
- if (!pipeline) {
- pipeline = std::make_shared<vk_pipeline_struct>();
- }
- if (!pipeline->initialized) {
- pipeline->name = name;
- pipeline->parameter_count = parameter_count;
- pipeline->push_constant_size = push_constant_size;
- pipeline->wg_denoms = wg_denoms;
- pipeline->align = align;
- pipeline->initialized = true;
- }
+ vk_pipeline *ptr = &base_pipeline;
- if (!pipeline->needed || pipeline->compiled) {
- return;
+ int num_pipelines = 1;
+#if defined(VK_EXT_shader_64bit_indexing)
+ if (device->shader_64b_indexing) {
+ num_pipelines = 2;
}
- // TODO: We're no longer benefitting from the async compiles (shaders are
- // compiled individually, as needed) and this complexity can be removed.
- {
- // wait until fewer than N compiles are in progress
- uint32_t N = std::max(1u, std::thread::hardware_concurrency());
- std::unique_lock<std::mutex> guard(compile_count_mutex);
- while (compile_count >= N) {
- compile_count_cond.wait(guard);
+#endif
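+    // Compile the base pipeline and, when VK_EXT_shader_64bit_indexing is available,
+    // a second 64-bit indexing variant chained through the pipeline's 'next' pointer.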
+ for (int i = 0; i < num_pipelines; ++i, ptr = &(*ptr)->next) {
+ vk_pipeline &pipeline = *ptr;
+ if (!pipeline) {
+ pipeline = std::make_shared<vk_pipeline_struct>();
+ }
+ if (!pipeline->initialized) {
+ pipeline->name = name;
+ pipeline->parameter_count = parameter_count;
+ pipeline->push_constant_size = push_constant_size;
+ pipeline->wg_denoms = wg_denoms;
+ pipeline->align = align;
+ pipeline->initialized = true;
+#if defined(VK_EXT_shader_64bit_indexing)
+ pipeline->is_64b_indexing = (i == 1);
+#endif
+ }
+
+ if (!pipeline->needed || pipeline->compiled) {
+ continue;
+ }
+ // TODO: We're no longer benefitting from the async compiles (shaders are
+ // compiled individually, as needed) and this complexity can be removed.
+ {
+ // wait until fewer than N compiles are in progress
+ uint32_t N = std::max(1u, std::thread::hardware_concurrency());
+ std::unique_lock<std::mutex> guard(compile_count_mutex);
+ while (compile_count >= N) {
+ compile_count_cond.wait(guard);
+ }
+ compile_count++;
}
- compile_count++;
- }
- compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
- parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
+ compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
+ parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
+ }
};
auto const &ggml_vk_create_pipeline2 = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const char *entrypoint,
bool pipeline_executable_properties_support = false;
device->coopmat_support = false;
device->integer_dot_product = false;
+ device->shader_64b_indexing = false;
bool bfloat16_support = false;
for (const auto& properties : ext_props) {
device->memory_priority = true;
} else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
device->external_memory_host = true;
+#if defined(VK_EXT_shader_64bit_indexing)
+ } else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) {
+ device->shader_64b_indexing = true;
+#endif
}
}
device_extensions.push_back("VK_EXT_external_memory_host");
}
+#if defined(VK_EXT_shader_64bit_indexing)
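+        // Chain the 64-bit indexing feature struct into the feature query and request the
+        // extension when the device advertises it.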
+ VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {};
+ shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT;
+ if (device->shader_64b_indexing) {
+ last_struct->pNext = (VkBaseOutStructure *)&shader_64bit_indexing_features;
+ last_struct = (VkBaseOutStructure *)&shader_64bit_indexing_features;
+ device_extensions.push_back("VK_EXT_shader_64bit_indexing");
+ }
+#endif
+
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
device->pipeline_executable_properties_support = pipeline_executable_properties_support;
ggml_vk_sync_buffers(ctx, subctx);
}
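+
+// Walk a pipeline's variant list and return the 64-bit indexing variant if one exists;
+// otherwise return the base pipeline unchanged.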
+static vk_pipeline ggml_vk_get_64b_indexing_pipeline(ggml_backend_vk_context * ctx, vk_pipeline &pipeline) {
+ GGML_UNUSED(ctx);
+#if defined(VK_EXT_shader_64bit_indexing)
+ vk_pipeline *ptr = &pipeline;
+ while (*ptr) {
+ if ((*ptr)->is_64b_indexing) {
+ return *ptr;
+ }
+ ptr = &(*ptr)->next;
+ }
+#endif
+ return pipeline;
+}
+
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) {
VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type));
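+    // src0 exceeding the maximum storage buffer range requires the 64-bit indexing variant.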
+ if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+ pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+ }
+
// Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11;
const uint64_t x_ne = ggml_nelements(src0);
to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
}
+ if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+ dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv);
+ }
+
const bool qx_needs_dequant = x_non_contig;
const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig);
gqa_ratio = 1;
}
+ vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1];
+
+ if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+ pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+ }
+
{
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
}
vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true);
workgroups_z /= gqa_ratio;
}
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1],
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
d_Qx,
d_Qy,
const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
const uint32_t channel_stride_y = nb12 / sizeof(float);
+ vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_nc_f16_f32;
+ if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+ pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+ }
+
{
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
}
vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true);
init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]);
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
d_Qx,
d_Qy,
// Handle huge A matrix by splitting the M dimensions. This works well for convolution use cases
// where the M dimension is very large.
// Split_k doesn't work with M splitting.
+    // M splitting only supports batch size == 1.
const size_t nbytes = ggml_nbytes(src0);
- const bool needs_split = nbytes > ctx->device->properties.limits.maxStorageBufferRange;
+ const bool needs_split = dst->ne[2] == 1 && dst->ne[3] == 1 && nbytes > ctx->device->properties.limits.maxStorageBufferRange;
if (needs_split) {
// Choose the number of rows that can fit (and divide by two, to allow for any additional offsets)
const uint32_t M_split = ctx->device->properties.limits.maxStorageBufferRange / (2 * src0->nb[1]);
vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type);
+ if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+ pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+ }
// Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
const uint64_t x_ne = ggml_nelements(src0);
const bool qx_needs_dequant = x_non_contig;
const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig);
+ if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+ dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv);
+ }
+
// Not implemented
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT