Plug memory leaks and free resources on shutdown (llama/19315)

author Nikhil Jain <redacted>

Tue, 10 Feb 2026 16:04:00 +0000 (08:04 -0800)

committer Georgi Gerganov <redacted>

Sat, 14 Feb 2026 22:20:18 +0000 (00:20 +0200)
author Nikhil Jain <redacted>
Tue, 10 Feb 2026 16:04:00 +0000 (08:04 -0800)
committer Georgi Gerganov <redacted>
Sat, 14 Feb 2026 22:20:18 +0000 (00:20 +0200)
diff --git a/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp

index 6997f6bdd31d97b414b105d31850c210d0975d78..63f797f142d0c14c1e40d0403d042665647bb188 100644 (file)
--- a/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -4,6 +4,7 @@
  #include "ggml.h"
  #include "pre_wgsl.hpp"
  
+#include <memory>
  #include <string>
  #include <vector>
  
@@ -18,9 +19,9 @@
  #define GGML_WEBGPU_ARGSORT_MERGE_MAX_WG_SIZE 512u
  
  struct ggml_webgpu_processed_shader {
-    std::string wgsl;
-    std::string variant;
-    void *      decisions;
+    std::string           wgsl;
+    std::string           variant;
+    std::shared_ptr<void> decisions;
  };
  
  // Same hash combine function as in boost
@@ -192,13 +193,13 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_shader(
      defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
  
      ggml_webgpu_processed_shader result;
-    result.wgsl                                         = preprocessor.preprocess(shader_src, defines);
-    result.variant                                      = variant;
-    ggml_webgpu_flash_attn_shader_decisions * decisions = new ggml_webgpu_flash_attn_shader_decisions();
-    decisions->q_tile                                   = q_tile;
-    decisions->kv_tile                                  = kv_tile;
-    decisions->wg_size                                  = wg_size;
-    result.decisions                                    = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
+    decisions->q_tile  = q_tile;
+    decisions->kv_tile = kv_tile;
+    decisions->wg_size = wg_size;
+    result.decisions   = decisions;
      return result;
  }
  
@@ -270,11 +271,11 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_pad_shader(
      defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
  
      ggml_webgpu_processed_shader result;
-    result.wgsl                                      = preprocessor.preprocess(shader_src, defines);
-    result.variant                                   = variant;
-    ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
-    decisions->wg_size                               = context.max_wg_size;
-    result.decisions                                 = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+    decisions->wg_size = context.max_wg_size;
+    result.decisions   = decisions;
      return result;
  }
  
@@ -305,11 +306,11 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_argsort_shader(
      }
      defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
      ggml_webgpu_processed_shader result;
-    result.wgsl                                      = preprocessor.preprocess(shader_src, defines);
-    result.variant                                   = variant;
-    ggml_webgpu_argsort_shader_decisions * decisions = new ggml_webgpu_argsort_shader_decisions();
-    decisions->wg_size                               = wg_size;
-    result.decisions                                 = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_argsort_shader_decisions>();
+    decisions->wg_size = wg_size;
+    result.decisions   = decisions;
      return result;
  }
  
@@ -324,11 +325,11 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_argsort_merge_shader(
      uint32_t wg_size = std::min(GGML_WEBGPU_ARGSORT_MERGE_MAX_WG_SIZE, context.max_wg_size);
      defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
      ggml_webgpu_processed_shader result;
-    result.wgsl                                      = preprocessor.preprocess(shader_src, defines);
-    result.variant                                   = variant;
-    ggml_webgpu_argsort_shader_decisions * decisions = new ggml_webgpu_argsort_shader_decisions();
-    decisions->wg_size                               = wg_size;
-    result.decisions                                 = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_argsort_shader_decisions>();
+    decisions->wg_size = wg_size;
+    result.decisions   = decisions;
      return result;
  }
  
@@ -391,11 +392,11 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_set_rows_shader(
      defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
  
      ggml_webgpu_processed_shader result;
-    result.wgsl                                      = preprocessor.preprocess(shader_src, defines);
-    result.variant                                   = variant;
-    ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
-    decisions->wg_size                               = context.max_wg_size;
-    result.decisions                                 = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+    decisions->wg_size = context.max_wg_size;
+    result.decisions   = decisions;
      return result;
  }
  
@@ -457,11 +458,11 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_unary_shader(
      defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
  
      ggml_webgpu_processed_shader result;
-    result.wgsl                                      = preprocessor.preprocess(shader_src, defines);
-    result.variant                                   = variant;
-    ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
-    decisions->wg_size                               = context.max_wg_size;
-    result.decisions                                 = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+    decisions->wg_size = context.max_wg_size;
+    result.decisions   = decisions;
      return result;
  }
  
@@ -527,11 +528,11 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_binary_shader(
  
      defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
      ggml_webgpu_processed_shader result;
-    result.wgsl                                      = preprocessor.preprocess(shader_src, defines);
-    result.variant                                   = variant;
-    ggml_webgpu_generic_shader_decisions * decisions = new ggml_webgpu_generic_shader_decisions();
-    decisions->wg_size                               = context.max_wg_size;
-    result.decisions                                 = decisions;
+    result.wgsl        = preprocessor.preprocess(shader_src, defines);
+    result.variant     = variant;
+    auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+    decisions->wg_size = context.max_wg_size;
+    result.decisions   = decisions;
      return result;
  }
  #endif  // GGML_WEBGPU_SHADER_LIB_HPP
diff --git a/src/ggml-webgpu/ggml-webgpu.cpp b/src/ggml-webgpu/ggml-webgpu.cpp

index f7ceca11212a0056bb4891ff0c231b3c2c71381b..32e120266a9bb477ee8a0979694eda729260dbd6 100644 (file)
--- a/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/src/ggml-webgpu/ggml-webgpu.cpp
@@ -186,11 +186,17 @@ struct webgpu_buf_pool {
      void cleanup() {
          std::lock_guard<std::mutex> lock(mutex);
          for (auto & bufs : free) {
-            bufs.host_buf.Destroy();
-            bufs.dev_buf.Destroy();
+            if (bufs.host_buf) {
+                bufs.host_buf.Destroy();
+            }
+            if (bufs.dev_buf) {
+                bufs.dev_buf.Destroy();
+            }
          }
          free.clear();
      }
+
+    ~webgpu_buf_pool() { this->cleanup(); }
  };
  
  #ifdef GGML_WEBGPU_GPU_PROFILE
@@ -252,13 +258,15 @@ struct webgpu_gpu_profile_buf_pool {
          }
          free.clear();
      }
+
+    ~webgpu_gpu_profile_buf_pool() { this->cleanup(); }
  };
  #endif
  
  struct webgpu_pipeline {
      wgpu::ComputePipeline pipeline;
      std::string           name;
-    void *                context = nullptr;
+    std::shared_ptr<void> context = nullptr;
  };
  
  struct webgpu_command {
@@ -319,6 +327,23 @@ struct webgpu_global_context_struct {
      wgpu::Buffer debug_host_buf;
      wgpu::Buffer debug_dev_buf;
  #endif
+
+    ~webgpu_global_context_struct() {
+        if (this->get_tensor_staging_buf) {
+            this->get_tensor_staging_buf.Destroy();
+            this->get_tensor_staging_buf = nullptr;
+        }
+#ifdef GGML_WEBGPU_DEBUG
+        if (this->debug_host_buf) {
+            this->debug_host_buf.Destroy();
+            this->debug_host_buf = nullptr;
+        }
+        if (this->debug_dev_buf) {
+            this->debug_dev_buf.Destroy();
+            this->debug_dev_buf = nullptr;
+        }
+#endif
+    }
  };
  
  typedef std::shared_ptr<webgpu_global_context_struct> webgpu_global_context;
@@ -744,7 +769,6 @@ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) {
      return ctx->name.c_str();
  }
  
-// TODO: implement proper cleanup
  static void ggml_backend_webgpu_free(ggml_backend_t backend) {
      ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
      WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
@@ -788,9 +812,8 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
      std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n";
  #endif
  
-#if !defined(GGML_WEBGPU_CPU_PROFILE) && !defined(GGML_WEBGPU_GPU_PROFILE)
-    GGML_UNUSED(ctx);
-#endif
+    delete ctx;
+    delete backend;
  }
  
  static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) {
@@ -896,8 +919,7 @@ static webgpu_command ggml_webgpu_pad(webgpu_context & ctx, ggml_tensor * src, g
          ctx->pad_pipelines.emplace(pipeline_key, pipeline);
      }
  
-    ggml_webgpu_generic_shader_decisions decisions =
-        *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
  
      const uint32_t ne = (uint32_t) ggml_nelements(dst);
  
@@ -941,7 +963,7 @@ static webgpu_command ggml_webgpu_pad(webgpu_context & ctx, ggml_tensor * src, g
           .size    = ggml_webgpu_tensor_binding_size(ctx, dst) }
      };
  
-    uint32_t wg_x = CEIL_DIV(ne, decisions.wg_size);
+    uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
      return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
  }
  
@@ -975,8 +997,7 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
          ctx->set_rows_pipelines.emplace(key, pipeline);
      }
  
-    ggml_webgpu_generic_shader_decisions decisions =
-        *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
  
      std::optional<webgpu_pool_bufs> error_bufs = std::nullopt;
      if (key.i64_idx) {
@@ -1028,7 +1049,7 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
      } else {
          threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
      }
-    uint32_t wg_x = CEIL_DIV(threads, decisions.wg_size);
+    uint32_t wg_x = CEIL_DIV(threads, decisions->wg_size);
      return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x, 1,
                                       error_bufs);
  }
@@ -1297,10 +1318,9 @@ static webgpu_command ggml_webgpu_flash_attn(webgpu_context & ctx,
          ctx->flash_attn_pipelines.emplace(key, pipeline);
      }
  
-    ggml_webgpu_flash_attn_shader_decisions decisions =
-        *static_cast<ggml_webgpu_flash_attn_shader_decisions *>(pipeline.context);
+    auto * decisions = static_cast<ggml_webgpu_flash_attn_shader_decisions *>(pipeline.context.get());
  
-    uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions.q_tile);
+    uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions->q_tile);
      uint32_t wg_x        = wg_per_head * Q->ne[2] * Q->ne[3];  // wg per head * number of heads * number of batches
      return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
  }
@@ -1331,8 +1351,7 @@ static webgpu_command ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * s
          ctx->unary_pipelines.emplace(pipeline_key, pipeline);
      }
  
-    ggml_webgpu_generic_shader_decisions decisions =
-        *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
  
      uint32_t ne = (uint32_t) ggml_nelements(dst);
  
@@ -1392,7 +1411,7 @@ static webgpu_command ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * s
                              .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
      }
  
-    uint32_t wg_x = CEIL_DIV(ne, decisions.wg_size);
+    uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
      return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
  }
  
@@ -1425,8 +1444,7 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
          ctx->binary_pipelines.emplace(pipeline_key, pipeline);
      }
  
-    ggml_webgpu_generic_shader_decisions decisions =
-        *static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context);
+    auto * decisions = static_cast<ggml_webgpu_argsort_shader_decisions *>(pipeline.context.get());
  
      uint32_t ne = (uint32_t) ggml_nelements(dst);
  
@@ -1471,7 +1489,7 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
                              .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
      }
  
-    uint32_t wg_x = CEIL_DIV(ne, decisions.wg_size);
+    uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
      return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
  }
  
@@ -1821,8 +1839,7 @@ static webgpu_command ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * sr
          argsort_pipeline.context = processed.decisions;
          ctx->argsort_pipelines.emplace(order, argsort_pipeline);
      }
-    ggml_webgpu_argsort_shader_decisions argsort_decisions =
-        *static_cast<ggml_webgpu_argsort_shader_decisions *>(argsort_pipeline.context);
+    auto * argsort_decisions = static_cast<ggml_webgpu_argsort_shader_decisions *>(argsort_pipeline.context.get());
  
      webgpu_pipeline argsort_merge_pipeline;
      it = ctx->argsort_merge_pipelines.find(order);
@@ -1839,13 +1856,13 @@ static webgpu_command ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * sr
  
      const uint32_t src_ne0 = (uint32_t) src->ne[0];
      const uint32_t nrows   = (uint32_t) ggml_nrows(src);
-    const uint32_t npr     = CEIL_DIV(src_ne0, argsort_decisions.wg_size);
+    const uint32_t npr     = CEIL_DIV(src_ne0, argsort_decisions->wg_size);
      const uint32_t block_size =
-        is_top_k ? std::min(argsort_decisions.wg_size, (uint32_t) dst->ne[0]) : argsort_decisions.wg_size;
+        is_top_k ? std::min(argsort_decisions->wg_size, (uint32_t) dst->ne[0]) : argsort_decisions->wg_size;
      uint32_t out_ne0 = src_ne0;
      if (is_top_k) {
          if (npr > 1) {
-            const uint32_t last_tile = src_ne0 - (npr - 1) * argsort_decisions.wg_size;
+            const uint32_t last_tile = src_ne0 - (npr - 1) * argsort_decisions->wg_size;
              out_ne0                  = (npr - 1) * block_size + std::min(last_tile, block_size);
          } else {
              out_ne0 = block_size;
@@ -2198,7 +2215,10 @@ static ggml_backend_i ggml_backend_webgpu_i = {
  
  static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
-    ctx->buffer.Destroy();
+    if (ctx != nullptr && ctx->buffer != nullptr) {
+        ctx->buffer.Destroy();
+        delete ctx;
+    }
  }
  
  // Returns the "fake" base pointer.
@@ -2926,12 +2946,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
      dev_desc.SetDeviceLostCallback(
          wgpu::CallbackMode::AllowSpontaneous,
          [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
+            if (reason == wgpu::DeviceLostReason::Destroyed) {
+                return;
+            }
              GGML_UNUSED(device);
-            GGML_UNUSED(reason);
-            GGML_UNUSED(message);
-            //TODO: uncomment once proper free logic is in place
-            //GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
-            //std::string(message).c_str());
+            GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
+                           std::string(message).c_str());
          });
      dev_desc.SetUncapturedErrorCallback(
          [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
@@ -3365,10 +3385,7 @@ static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) {
      return ctx->device_count;
  }
  
-// TODO: Does this need to be thread safe? Is it only called once?
-// TODO: move most logic to device_init function so backend can be freed/initialized properly
  // Only one device is supported for now
-
  static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
      GGML_ASSERT(index == 0);
      WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()");
author	Nikhil Jain <redacted>
	Tue, 10 Feb 2026 16:04:00 +0000 (08:04 -0800)
committer	Georgi Gerganov <redacted>
	Sat, 14 Feb 2026 22:20:18 +0000 (00:20 +0200)
src/ggml-webgpu/ggml-webgpu-shader-lib.hpp		patch \| blob \| history
src/ggml-webgpu/ggml-webgpu.cpp		patch \| blob \| history