struct ggml_backend_vk_context;
-struct vk_queue {
- uint32_t queue_family_index;
- vk::Queue queue;
- vk::CommandPool pool;
- uint32_t cmd_buffer_idx;
- std::vector<vk::CommandBuffer> cmd_buffers;
-
- vk::PipelineStageFlags stage_flags;
-
- bool transfer_only;
-};
-
#define MAX_PARAMETER_COUNT 8
struct vk_pipeline_struct {
vk_device device;
};
+struct vk_queue;
+
+// Holds a command pool and the command buffers allocated from it. There's an
+// instance of this for each (context, queue) pair and for each (device, queue) pair.
+struct vk_command_pool {
+ void init(vk_device& device, vk_queue *q_);
+ void destroy(vk::Device& device);
+
+ vk::CommandPool pool;
+ uint32_t cmd_buffer_idx;
+ std::vector<vk::CommandBuffer> cmd_buffers;
+
+ vk_queue *q;
+};
+
+struct vk_queue {
+ uint32_t queue_family_index;
+ vk::Queue queue;
+
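+ // Device-wide command pool owned by the queue; contexts additionally keep their own per-context pools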
+ vk_command_pool cmd_pool;
+
+ vk::PipelineStageFlags stage_flags;
+
+ bool transfer_only;
+
+ // Copy everything except the cmd_pool, which each queue owns separately
+ void copyFrom(vk_queue &other) {
+ queue_family_index = other.queue_family_index;
+ queue = other.queue;
+ stage_flags = other.stage_flags;
+ transfer_only = other.transfer_only;
+ }
+};
+
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
ggml_vk_destroy_buffer(sync_staging);
- device.destroyCommandPool(compute_queue.pool);
- if (!single_queue) {
- device.destroyCommandPool(transfer_queue.pool);
- }
+ compute_queue.cmd_pool.destroy(device);
+ transfer_queue.cmd_pool.destroy(device);
for (auto& pipeline : pipelines) {
if (pipeline.second.expired()) {
}
};
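+// Create a transient command pool on q_'s queue family. Command buffers are allocated from it lazily and reused after the pool is reset.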
+void vk_command_pool::init(vk_device& device, vk_queue *q_) {
+ cmd_buffer_idx = 0;
+ q = q_;
+
+ vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
+ pool = device->device.createCommandPool(command_pool_create_info);
+}
+
+void vk_command_pool::destroy(vk::Device& device) {
+ device.destroyCommandPool(pool);
+ pool = nullptr;
+ cmd_buffers.clear();
+}
+
struct vk_buffer_struct {
vk::Buffer buffer = VK_NULL_HANDLE;
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
std::vector<vk_staging_memcpy> in_memcpys;
std::vector<vk_staging_memcpy> out_memcpys;
- vk_queue * q;
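+ // Command pool (and the queue that owns it) this context records into and submits to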
+ vk_command_pool * p {};
};
typedef std::shared_ptr<vk_context_struct> vk_context;
typedef std::weak_ptr<vk_context_struct> vk_context_ref;
std::vector<vk::DescriptorSet> descriptor_sets;
uint32_t descriptor_set_idx {};
uint32_t pipeline_descriptor_set_requirements {};
+
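+ // Per-context command pools, created against the device's compute and transfer queues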
+ vk_command_pool compute_cmd_pool;
+ vk_command_pool transfer_cmd_pool;
};
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
}
}
-static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
+static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
- std::lock_guard<std::mutex> guard(device->mutex);
- if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
+ if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
// Reuse command buffer
- return q.cmd_buffers[q.cmd_buffer_idx++];
+ return p.cmd_buffers[p.cmd_buffer_idx++];
}
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
- q.pool,
+ p.pool,
vk::CommandBufferLevel::ePrimary,
1);
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
auto buf = cmd_buffers.front();
- q.cmd_buffers.push_back(buf);
- q.cmd_buffer_idx++;
+ p.cmd_buffers.push_back(buf);
+ p.cmd_buffer_idx++;
return buf;
}
-static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
- VK_LOG_DEBUG("ggml_vk_create_submission()");
- vk_submission s;
- s.buffer = ggml_vk_create_cmd_buffer(device, q);
- s.wait_semaphores = std::move(wait_semaphores);
- s.signal_semaphores = std::move(signal_semaphores);
- return s;
-}
-
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
if (fence) {
- ctx->q->queue.submit({}, fence);
+ ctx->p->q->queue.submit({}, fence);
}
return;
}
tl_signal_vals.push_back({});
tl_signal_semaphores.push_back({});
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
- stage_flags[idx].push_back(ctx->q->stage_flags);
+ stage_flags[idx].push_back(ctx->p->q->stage_flags);
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
}
}
}
- ctx->q->queue.submit(submit_infos, fence);
+ ctx->p->q->queue.submit(submit_infos, fence);
ctx->seqs.clear();
}
q.queue_family_index = queue_family_index;
q.transfer_only = transfer_only;
- vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
- q.pool = device->device.createCommandPool(command_pool_create_info_compute);
-
- q.cmd_buffer_idx = 0;
+ q.cmd_pool.init(device, &q);
q.queue = device->device.getQueue(queue_family_index, queue_index);
q.stage_flags = stage_flags;
}
-static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
+static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
vk_context result = std::make_shared<vk_context_struct>();
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
ctx->gc.contexts.emplace_back(result);
- result->q = &q;
+ result->p = &p;
return result;
}
-static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
+static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
vk_context result = std::make_shared<vk_context_struct>();
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
- result->q = &q;
+ result->p = &p;
return result;
}
return ctx->gc.events[ctx->event_idx++];
}
-static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
- VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
- std::lock_guard<std::mutex> guard(device->mutex);
+static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
+ VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
// Requires command buffers to be done
- device->device.resetCommandPool(q.pool);
- q.cmd_buffer_idx = 0;
+ device->device.resetCommandPool(p.pool);
+ p.cmd_buffer_idx = 0;
+}
+
+static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
+ VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
+
+ // Arbitrary threshold: once this many command buffers have been handed out, reset the pool so they can be reused
+ static constexpr uint32_t cleanup_frequency = 10;
+
+ if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+ ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
+ }
+ if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+ ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
+ }
}
+
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
vk::MemoryType memory_type = mem_props->memoryTypes[i];
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
}
- std::lock_guard<std::mutex> guard(device->mutex);
-
vk_buffer buf = std::make_shared<vk_buffer_struct>();
if (size == 0) {
static void ggml_vk_sync_buffers(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
- const bool transfer_queue = ctx->q->transfer_only;
+ const bool transfer_queue = ctx->p->q->transfer_only;
ctx->s->buffer.pipelineBarrier(
- ctx->q->stage_flags,
- ctx->q->stage_flags,
+ ctx->p->q->stage_flags,
+ ctx->p->q->stage_flags,
{},
{ {
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
ctx->s->buffer.waitEvents(
events,
- ctx->q->stage_flags,
- ctx->q->stage_flags,
+ ctx->p->q->stage_flags,
+ ctx->p->q->stage_flags,
{},
{},
{}
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
} else {
// TODO: Use pointer or reference to avoid copy
- device->transfer_queue = device->compute_queue;
+ device->transfer_queue.copyFrom(device->compute_queue);
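+ // Even when the transfer queue aliases the compute queue, it keeps its own command pool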
+ device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
}
device->buffer_type = {
ctx->fence = ctx->device->device.createFence({});
ctx->almost_ready_fence = ctx->device->device.createFence({});
+ ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
+ ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
+
#ifdef GGML_VULKAN_CHECK_RESULTS
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
}
}
-static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
+static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
vk_submission s;
- s.buffer = ggml_vk_create_cmd_buffer(device, q);
+ s.buffer = ggml_vk_create_cmd_buffer(device, p);
if (one_time) {
s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
} else {
ggml_vk_ctx_end(subctx);
}
- subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
+ subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
}
} else {
- vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
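+ // Take the device mutex: this path records into the device-wide transfer command pool, and ggml_vk_create_cmd_buffer no longer locks it internally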
+ std::lock_guard<std::mutex> guard(dst->device->mutex);
+
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
dst->device->device.resetFences({ dst->device->fence });
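+ // Reset the device-wide command pools once enough buffers have accumulated, so they can be reused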
+ ggml_vk_queue_command_pools_cleanup(dst->device);
}
}
memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
- vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+ std::lock_guard<std::mutex> guard(src->device->mutex);
+
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
src->device->device.resetFences({ src->device->fence });
+ ggml_vk_queue_command_pools_cleanup(src->device);
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
if (src->device == dst->device) {
+ std::lock_guard<std::mutex> guard(src->device->mutex);
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
// Copy within the device
- vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
src->device->device.resetFences({ src->device->fence });
+ ggml_vk_queue_command_pools_cleanup(src->device);
} else {
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
- vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+ std::lock_guard<std::mutex> guard(dst->device->mutex);
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
dst->device->device.resetFences({ dst->device->fence });
+ ggml_vk_queue_command_pools_cleanup(dst->device);
}
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ggml_vk_ctx_begin(ctx->device, subctx);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_matmul(
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
ctx->device->device.resetFences({ ctx->fence });
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
auto end = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
free(d_chk);
- ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
- ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
ggml_vk_destroy_buffer(d_X);
ggml_vk_destroy_buffer(d_Y);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ggml_vk_ctx_begin(ctx->device, subctx);
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
ctx->device->device.resetFences({ ctx->fence });
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
auto end = std::chrono::high_resolution_clock::now();
//
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
//
-// vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
// ggml_vk_ctx_begin(ctx->device, subctx);
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
// ggml_vk_ctx_end(subctx);
// ggml_vk_submit(subctx, ctx->fence);
// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
// ctx->device->device.resetFences({ ctx->fence });
+// ggml_vk_queue_command_pools_cleanup(ctx->device);
//
// auto end = std::chrono::high_resolution_clock::now();
//
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ggml_vk_ctx_begin(ctx->device, subctx);
if (mmq) {
for (size_t i = 0; i < num_it; i++) {
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
ctx->device->device.resetFences({ ctx->fence });
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
auto end = std::chrono::high_resolution_clock::now();
if (!dryrun) {
if (ctx->compute_ctx.expired()) {
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
}
ctx->gc.temp_buffers.clear();
- ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
- ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
}
ctx->descriptor_pools.clear();
ctx->descriptor_sets.clear();
+
+ ctx->compute_cmd_pool.destroy(ctx->device->device);
+ ctx->transfer_cmd_pool.destroy(ctx->device->device);
}
static int ggml_vk_get_device_count() {
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
GGML_ASSERT(ctx->compute_ctx.expired());
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
if (vk_perf_logger_enabled) {
if (ctx->compute_ctx.expired()) {
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {