size_t n;
};
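+// A host-side memset recorded on a context and executed together with its staged memcpys.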
+struct vk_staging_memset {
+ vk_staging_memset(void * _dst, uint32_t _val, size_t _n) : dst(_dst), val(_val), n(_n) {}
+
+ void * dst;
+ uint32_t val;
+ size_t n;
+};
+
struct vk_context_struct {
vk_submission * s;
std::vector<vk_sequence> seqs;
std::vector<vk_staging_memcpy> in_memcpys;
std::vector<vk_staging_memcpy> out_memcpys;
+ std::vector<vk_staging_memset> memsets;
vk_command_pool * p {};
};
}
}
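+// Perform the memset immediately, or record it in the provided list so it can be executed
+// later alongside the context's staged copies. Note: memset() repeats only the low byte of
+// val, so this matches vkCmdFillBuffer semantics only for byte-repeating values such as 0.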
+static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets = nullptr) {
+ if (memsets == nullptr) {
+ memset(dst, val, size);
+ } else {
+ memsets->emplace_back(dst, val, size);
+ }
+}
+
static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
if (device->sync_staging == nullptr || device->sync_staging->size < size) {
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
memcpy(cpy.dst, cpy.src, cpy.n);
}
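+ // perform any deferred host-side memsets recorded for this context before submitting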
+ for (auto& mset : subctx->memsets) {
+ memset(mset.dst, mset.val, mset.n);
+ }
+
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
dst->device->device.resetFences({ dst->device->fence });
static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
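+ // For host-visible buffers on UMA devices, defer a host-side memset instead of recording a fillBuffer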
+ if ((dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) &&
+ dst->device->uma) {
+ deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets);
+ return;
+ }
+
+ // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
}
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
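+ // For host-visible buffers on UMA devices, write directly through the mapped host pointer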
+ if ((dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) &&
+ dst->device->uma) {
+ memset((uint8_t*)dst->ptr + offset, c, size);
+ return;
+ }
+
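+ // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers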
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
memcpy(cpy.dst, cpy.src, cpy.n);
}
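+ // perform any deferred host-side memsets recorded for this context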
+ for (auto& mset : subctx->memsets) {
+ memset(mset.dst, mset.val, mset.n);
+ }
+
if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
ggml_vk_submit(subctx, ctx->almost_ready_fence);
ctx->almost_ready_fence_pending = true;
}
subctx->in_memcpys.clear();
subctx->out_memcpys.clear();
+ subctx->memsets.clear();
}
return true;