#include "ggml.h"
#include "pre_wgsl.hpp"
+#include <memory>
#include <string>
#include <vector>
#define GGML_WEBGPU_ARGSORT_MERGE_MAX_WG_SIZE 512u
// Result of preprocessing a WGSL shader: the generated source text, the
// variant name, and a type-erased OWNING pointer to the per-shader decision
// struct (workgroup size, tile sizes, ...). Using shared_ptr<void> means the
// decision struct is freed automatically together with the last pipeline
// that references it — no manual delete anywhere.
struct ggml_webgpu_processed_shader {
    std::string           wgsl;
    std::string           variant;
    std::shared_ptr<void> decisions;
};
// Same hash combine function as in boost
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_flash_attn_shader_decisions * decisions = new ggml_webgpu_flash_attn_shader_decisions();
- decisions->q_tile = q_tile;
- decisions->kv_tile = kv_tile;
- decisions->wg_size = wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
+ decisions->q_tile = q_tile;
+ decisions->kv_tile = kv_tile;
+ decisions->wg_size = wg_size;
+ result.decisions = decisions;
return result;
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
- decisions->wg_size = context.max_wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+ decisions->wg_size = context.max_wg_size;
+ result.decisions = decisions;
return result;
}
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_argsort_shader_decisions * decisions = new ggml_webgpu_argsort_shader_decisions();
- decisions->wg_size = wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_argsort_shader_decisions>();
+ decisions->wg_size = wg_size;
+ result.decisions = decisions;
return result;
}
uint32_t wg_size = std::min(GGML_WEBGPU_ARGSORT_MERGE_MAX_WG_SIZE, context.max_wg_size);
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_argsort_shader_decisions * decisions = new ggml_webgpu_argsort_shader_decisions();
- decisions->wg_size = wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_argsort_shader_decisions>();
+ decisions->wg_size = wg_size;
+ result.decisions = decisions;
return result;
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
- decisions->wg_size = context.max_wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+ decisions->wg_size = context.max_wg_size;
+ result.decisions = decisions;
return result;
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
- decisions->wg_size = context.max_wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+ decisions->wg_size = context.max_wg_size;
+ result.decisions = decisions;
return result;
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
ggml_webgpu_processed_shader result;
- result.wgsl = preprocessor.preprocess(shader_src, defines);
- result.variant = variant;
- ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
- decisions->wg_size = context.max_wg_size;
- result.decisions = decisions;
+ result.wgsl = preprocessor.preprocess(shader_src, defines);
+ result.variant = variant;
+ auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+ decisions->wg_size = context.max_wg_size;
+ result.decisions = decisions;
return result;
}
#endif // GGML_WEBGPU_SHADER_LIB_HPP
void cleanup() {
std::lock_guard<std::mutex> lock(mutex);
for (auto & bufs : free) {
- bufs.host_buf.Destroy();
- bufs.dev_buf.Destroy();
+ if (bufs.host_buf) {
+ bufs.host_buf.Destroy();
+ }
+ if (bufs.dev_buf) {
+ bufs.dev_buf.Destroy();
+ }
}
free.clear();
}
+
+ ~webgpu_buf_pool() { this->cleanup(); }
};
#ifdef GGML_WEBGPU_GPU_PROFILE
}
free.clear();
}

// Release the pooled profiling buffers when the pool itself is destroyed.
~webgpu_gpu_profile_buf_pool() { this->cleanup(); }
};
#endif
// A compiled compute pipeline plus a type-erased OWNING pointer to the
// shader decision struct it was built with (see
// ggml_webgpu_processed_shader::decisions). shared_ptr<void> replaces the
// old raw void* so the decisions can never leak.
struct webgpu_pipeline {
    wgpu::ComputePipeline pipeline;
    std::string           name;
    std::shared_ptr<void> context = nullptr;
};
struct webgpu_command {
wgpu::Buffer debug_host_buf;
wgpu::Buffer debug_dev_buf;
#endif
+
+ ~webgpu_global_context_struct() {
+ if (this->get_tensor_staging_buf) {
+ this->get_tensor_staging_buf.Destroy();
+ this->get_tensor_staging_buf = nullptr;
+ }
+#ifdef GGML_WEBGPU_DEBUG
+ if (this->debug_host_buf) {
+ this->debug_host_buf.Destroy();
+ this->debug_host_buf = nullptr;
+ }
+ if (this->debug_dev_buf) {
+ this->debug_dev_buf.Destroy();
+ this->debug_dev_buf = nullptr;
+ }
+#endif
+ }
};
typedef std::shared_ptr<webgpu_global_context_struct> webgpu_global_context;
return ctx->name.c_str();
}
-// TODO: implement proper cleanup
static void ggml_backend_webgpu_free(ggml_backend_t backend) {
ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n";
#endif
-#if !defined(GGML_WEBGPU_CPU_PROFILE) && !defined(GGML_WEBGPU_GPU_PROFILE)
- GGML_UNUSED(ctx);
-#endif
+ delete ctx;
+ delete backend;
}
static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) {
ctx->pad_pipelines.emplace(pipeline_key, pipeline);
}
- ggml_webgpu_generic_shader_decisions decisions =
- *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+ auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
const uint32_t ne = (uint32_t) ggml_nelements(dst);
.size = ggml_webgpu_tensor_binding_size(ctx, dst) }
};
- uint32_t wg_x = CEIL_DIV(ne, decisions.wg_size);
+ uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
}
ctx->set_rows_pipelines.emplace(key, pipeline);
}
- ggml_webgpu_generic_shader_decisions decisions =
- *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+ auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
std::optional<webgpu_pool_bufs> error_bufs = std::nullopt;
if (key.i64_idx) {
} else {
threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
}
- uint32_t wg_x = CEIL_DIV(threads, decisions.wg_size);
+ uint32_t wg_x = CEIL_DIV(threads, decisions->wg_size);
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x, 1,
error_bufs);
}
ctx->flash_attn_pipelines.emplace(key, pipeline);
}
- ggml_webgpu_flash_attn_shader_decisions decisions =
- *static_cast<ggml_webgpu_flash_attn_shader_decisions *>(pipeline.context);
+ auto * decisions = static_cast<ggml_webgpu_flash_attn_shader_decisions *>(pipeline.context.get());
- uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions.q_tile);
+ uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions->q_tile);
uint32_t wg_x = wg_per_head * Q->ne[2] * Q->ne[3]; // wg per head * number of heads * number of batches
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
}
ctx->unary_pipelines.emplace(pipeline_key, pipeline);
}
- ggml_webgpu_generic_shader_decisions decisions =
- *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+ auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
uint32_t ne = (uint32_t) ggml_nelements(dst);
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
- uint32_t wg_x = CEIL_DIV(ne, decisions.wg_size);
+ uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
}
ctx->binary_pipelines.emplace(pipeline_key, pipeline);
}
- ggml_webgpu_generic_shader_decisions decisions =
- *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+ auto * decisions = static_cast<ggml_webgpu_argsort_shader_decisions *>(pipeline.context.get());
uint32_t ne = (uint32_t) ggml_nelements(dst);
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
- uint32_t wg_x = CEIL_DIV(ne, decisions.wg_size);
+ uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
}
argsort_pipeline.context = processed.decisions;
ctx->argsort_pipelines.emplace(order, argsort_pipeline);
}
- ggml_webgpu_argsort_shader_decisions argsort_decisions =
- *static_cast<ggml_webgpu_argsort_shader_decisions *>(argsort_pipeline.context);
+ auto * argsort_decisions = static_cast<ggml_webgpu_argsort_shader_decisions *>(argsort_pipeline.context.get());
webgpu_pipeline argsort_merge_pipeline;
it = ctx->argsort_merge_pipelines.find(order);
const uint32_t src_ne0 = (uint32_t) src->ne[0];
const uint32_t nrows = (uint32_t) ggml_nrows(src);
- const uint32_t npr = CEIL_DIV(src_ne0, argsort_decisions.wg_size);
+ const uint32_t npr = CEIL_DIV(src_ne0, argsort_decisions->wg_size);
const uint32_t block_size =
- is_top_k ? std::min(argsort_decisions.wg_size, (uint32_t) dst->ne[0]) : argsort_decisions.wg_size;
+ is_top_k ? std::min(argsort_decisions->wg_size, (uint32_t) dst->ne[0]) : argsort_decisions->wg_size;
uint32_t out_ne0 = src_ne0;
if (is_top_k) {
if (npr > 1) {
- const uint32_t last_tile = src_ne0 - (npr - 1) * argsort_decisions.wg_size;
+ const uint32_t last_tile = src_ne0 - (npr - 1) * argsort_decisions->wg_size;
out_ne0 = (npr - 1) * block_size + std::min(last_tile, block_size);
} else {
out_ne0 = block_size;
static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
    if (ctx != nullptr) {
        // The wgpu::Buffer handle may be null; only Destroy() a live buffer,
        // but ALWAYS free the context itself — gating the delete on the
        // buffer handle would leak ctx when the buffer is null.
        if (ctx->buffer != nullptr) {
            ctx->buffer.Destroy();
        }
        delete ctx;
    }
}
// Returns the "fake" base pointer.
dev_desc.SetDeviceLostCallback(
    wgpu::CallbackMode::AllowSpontaneous,
    [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
        // An intentional teardown (backend free) also fires this callback;
        // don't report it as an error.
        if (reason == wgpu::DeviceLostReason::Destroyed) {
            return;
        }
        GGML_UNUSED(device);
        GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
                       std::string(message).c_str());
    });
dev_desc.SetUncapturedErrorCallback(
[](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
return ctx->device_count;
}
-// TODO: Does this need to be thread safe? Is it only called once?
-// TODO: move most logic to device_init function so backend can be freed/initialized properly
// Only one device is supported for now
-
static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()");