#define VK_VENDOR_ID_INTEL 0x8086
#define VK_VENDOR_ID_NVIDIA 0x10de
-#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
+#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
#define GGML_VK_MAX_NODES 8192
bool transfer_only;
};
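+// Maximum number of storage buffer bindings in the shared descriptor set layout.
+// Every compute pipeline must use at most this many parameters (see the
+// GGML_ASSERT on parameter_count at pipeline creation).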
+#define MAX_PARAMETER_COUNT 8
+
struct vk_pipeline_struct {
std::string name;
vk::ShaderModule shader_module;
- vk::DescriptorSetLayout dsl;
- std::vector<vk::DescriptorPool> descriptor_pools;
- std::vector<vk::DescriptorSet> descriptor_sets;
- uint32_t descriptor_set_idx;
vk::PipelineLayout layout;
vk::Pipeline pipeline;
uint32_t push_constant_size;
// set to true to indicate that some shaders need to be compiled after the dryrun
bool need_compiles {};
+ vk::DescriptorSetLayout dsl;
+
vk_matmul_pipeline pipeline_matmul_f32 {};
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
vk_matmul_pipeline pipeline_matmul_bf16 {};
vk_pipeline pipeline_flash_attn_split_k_reduce;
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
- std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
}
pipelines.clear();
+ device.destroyDescriptorSetLayout(dsl);
+
device.destroy();
}
};
vk_context_ref transfer_ctx;
std::vector<vk_context_ref> tensor_ctxs;
+
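+ // Descriptor pools and sets are owned by the backend context rather than by
+ // individual pipelines. descriptor_set_idx is the next unused set while
+ // recording a graph; pipeline_descriptor_set_requirements is the total number
+ // of sets requested during the dryrun pass. Both are reset after each graph
+ // evaluation, while the pools and allocated sets are kept for reuse.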
+ std::vector<vk::DescriptorPool> descriptor_pools;
+ std::vector<vk::DescriptorSet> descriptor_sets;
+ uint32_t descriptor_set_idx {};
+ uint32_t pipeline_descriptor_set_requirements {};
};
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
GGML_ASSERT(parameter_count > 0);
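+ // The shared descriptor set layout provides exactly MAX_PARAMETER_COUNT bindings,
+ // so no pipeline may declare more parameters than that.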
+ GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
- std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
- std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
- for (uint32_t i = 0; i < parameter_count; i++) {
- dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
- dsl_binding_flags.push_back({});
- }
-
- vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
-
vk::PushConstantRange pcr(
vk::ShaderStageFlagBits::eCompute,
0,
pipeline->push_constant_size
);
- vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
- {},
- dsl_binding);
- descriptor_set_layout_create_info.setPNext(&dslbfci);
- pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
-
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
-
- pipeline->descriptor_set_idx = 0;
-
- vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
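+ // Build the pipeline layout on top of the single descriptor set layout owned
+ // by the device instead of a per-pipeline layout.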
+ vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
- for (auto& pool : pipeline->descriptor_pools) {
- device.destroyDescriptorPool(pool);
- }
- pipeline->descriptor_pools.clear();
- pipeline->descriptor_sets.clear();
- pipeline->descriptor_set_idx = 0;
-
- device.destroyDescriptorSetLayout(pipeline->dsl);
-
device.destroyPipelineLayout(pipeline->layout);
device.destroyShaderModule(pipeline->shader_module);
device.destroyPipeline(pipeline->pipeline);
}
-static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
+static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
- device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+ ctx->pipeline_descriptor_set_requirements += n;
if (!pipeline->compiled) {
pipeline->needed = true;
- device->need_compiles = true;
+ ctx->device->need_compiles = true;
}
}
-static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
- std::lock_guard<std::mutex> guard(device->mutex);
-
- for (auto& pair : device->pipeline_descriptor_set_requirements) {
- vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
- const uint64_t n = pair.second;
-
- VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
+static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
- if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
- // Enough descriptors are available
- continue;
- }
+ if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
+ // Enough descriptors are available
+ return;
+ }
- uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
- uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
- uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+ vk_device& device = ctx->device;
- while (to_alloc > 0) {
- const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
- to_alloc -= alloc_count;
- pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
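+ // Work out how many additional sets are needed, how many of them still fit
+ // into the last (possibly partially used) pool, and which pool to start
+ // allocating from.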
+ uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
+ uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+ uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
- if (pool_idx >= pipeline->descriptor_pools.size()) {
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
- }
+ while (to_alloc > 0) {
+ const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
+ to_alloc -= alloc_count;
+ pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
- std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
- for (uint32_t i = 0; i < alloc_count; i++) {
- layouts[i] = pipeline->dsl;
- }
- vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
- std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
- pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
+ if (pool_idx >= ctx->descriptor_pools.size()) {
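+ // Each pool holds VK_DEVICE_DESCRIPTOR_POOL_SIZE sets with MAX_PARAMETER_COUNT
+ // storage buffer descriptors each, i.e. 256 * 8 = 2048 descriptors per pool
+ // with the defaults above.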
+ vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
+ vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
+ ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
+ }
- pool_idx++;
+ std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
+ for (uint32_t i = 0; i < alloc_count; i++) {
+ layouts[i] = device->dsl;
}
- }
-}
+ vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
+ std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+ ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
-static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
- VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
- pipeline->descriptor_set_idx = 0;
+ pool_idx++;
+ }
}
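+// A rough sketch of how the two functions above are used per graph (the concrete
+// call sites appear further down in this file): the dryrun pass calls
+// ggml_pipeline_request_descriptor_sets(ctx, pipeline, n) for each pipeline a node
+// will dispatch, shaders are compiled if needed, and a single call to
+// ggml_pipeline_allocate_descriptor_sets(ctx) then tops up ctx->descriptor_sets
+// before the nodes are actually dispatched.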
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
}
}
+
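+ // Create the single descriptor set layout shared by all compute pipelines on
+ // this device: MAX_PARAMETER_COUNT storage buffer bindings, one per possible
+ // shader parameter.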
+ std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
+ std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
+ for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
+ dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
+ dsl_binding_flags.push_back({});
+ }
+
+ vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
+
+ vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
+ {},
+ dsl_binding);
+ descriptor_set_layout_create_info.setPNext(&dslbfci);
+ device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+
ggml_vk_load_shaders(device);
if (!device->single_queue) {
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
}
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
- GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
- GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
+ GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
+ GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+ vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
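+ // Only the first parameter_count bindings of the shared layout are written for
+ // this dispatch; the remaining bindings are left untouched.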
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
}
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (qx_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
if (quantize_y) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
}
if (split_k > 1) {
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
}
return;
}
// Request descriptor sets
if (qx_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
- ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
return;
}
if (dryrun) {
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
return;
}
if (dryrun) {
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
return;
}
}
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (qx_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
return;
}
// Request descriptor sets
if (qx_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
- ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
return;
}
if (dryrun) {
// Request descriptor sets
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (split_k > 1) {
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
}
return;
}
}
if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return;
}
GGML_ASSERT(pipeline != nullptr);
if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return;
}
GGML_ASSERT(pipeline != nullptr);
if (dryrun) {
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return;
}
}
}
- ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
+ ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
if (split_k > 1) {
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer
ggml_vk_load_shaders(ctx->device);
}
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
+ ggml_pipeline_allocate_descriptor_sets(ctx);
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
ggml_vk_destroy_buffer(d_Y);
ggml_vk_destroy_buffer(d_D);
- ggml_pipeline_cleanup(p);
- ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
-
free(x);
free(y);
free(d);
ggml_vk_quantize_data(x, qx, ne, quant);
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
- ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, p, 1);
if (ctx->device->need_compiles) {
ggml_vk_load_shaders(ctx->device);
}
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
+ ggml_pipeline_allocate_descriptor_sets(ctx);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
//
// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
//
-// ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
+// ggml_pipeline_request_descriptor_sets(ctx, p, 1);
//
// if (ctx->device->need_compiles) {
// ggml_vk_load_shaders(ctx->device);
// }
//
-// ggml_pipeline_allocate_descriptor_sets(ctx->device);
+// ggml_pipeline_allocate_descriptor_sets(ctx);
//
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
//
// y[i] = i % k;
}
- ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
+ ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
if (split_k > 1) {
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer
}
}
if (mmq) {
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
}
if (ctx->device->need_compiles) {
ggml_vk_load_shaders(ctx->device);
}
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
+ ggml_pipeline_allocate_descriptor_sets(ctx);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
// These operations all go through ggml_vk_op_f32, so short-circuit and
// do the only thing needed for the dryrun.
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return false;
}
default:
}
ctx->gc.temp_buffers.clear();
- for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
- vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
-
- if (plr.expired()) {
- continue;
- }
-
- vk_pipeline pl = plr.lock();
- ggml_pipeline_cleanup(pl);
- }
-
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
ctx->tensor_ctxs.clear();
ctx->gc.contexts.clear();
- ctx->device->pipeline_descriptor_set_requirements.clear();
+ ctx->pipeline_descriptor_set_requirements = 0;
+ ctx->descriptor_set_idx = 0;
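+ // The allocated sets stay in ctx->descriptor_sets and are reused by the next
+ // graph; only the counters are reset here. The pools are destroyed on backend free.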
}
// Clean up on backend free
ctx->device->device.destroyFence(ctx->fence);
ctx->device->device.destroyFence(ctx->almost_ready_fence);
+
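+ // Destroying a pool implicitly frees every descriptor set allocated from it,
+ // so the cached set handles only need to be dropped.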
+ for (auto& pool : ctx->descriptor_pools) {
+ ctx->device->device.destroyDescriptorPool(pool);
+ }
+ ctx->descriptor_pools.clear();
+ ctx->descriptor_sets.clear();
}
static int ggml_vk_get_device_count() {
ggml_vk_load_shaders(ctx->device);
}
ggml_vk_preallocate_buffers(ctx);
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
+ ggml_pipeline_allocate_descriptor_sets(ctx);
int last_node = cgraph->n_nodes - 1;