llama: fix leaked buffers for mmap + split files (#16765)

author Johannes Gäßler <redacted>

Mon, 27 Oct 2025 08:17:31 +0000 (09:17 +0100)

committer GitHub <redacted>

Mon, 27 Oct 2025 08:17:31 +0000 (09:17 +0100)
author Johannes Gäßler <redacted>
Mon, 27 Oct 2025 08:17:31 +0000 (09:17 +0100)
committer GitHub <redacted>
Mon, 27 Oct 2025 08:17:31 +0000 (09:17 +0100)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index b88ff51f5da1d8ee61f9719fd8f9fbf295803193..05e467180089e800226556c957605369d34e519c 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15,7 +15,6 @@
  
  #include <algorithm>
  #include <cassert>
-#include <cmath>
  #include <cfloat>
  #include <cstring>
  #include <cmath>
@@ -438,7 +437,7 @@ struct llama_model::impl {
      llama_mlocks mlock_mmaps;
  
      // contexts where the model tensors metadata is stored as well as the corresponding buffers:
-    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
  
      buft_list_t cpu_buft_list;
      std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -6186,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
          bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
          bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  
-        ggml_backend_buffer_t buf = nullptr;
+        std::vector<ggml_backend_buffer_ptr> bufs;
          if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
              for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                  // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6199,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                      continue;
                  }
                  const size_t max_size = ggml_get_max_tensor_size(ctx);
-                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                  if (buf == nullptr) {
                      throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                  }
+                bufs.emplace_back(buf);
                  buf_map.emplace(idx, buf);
              }
          }
          else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
              if (buf == nullptr) {
                  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
              }
@@ -6217,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                  mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
              }
+            bufs.emplace_back(buf);
              for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                  buf_map.emplace(idx, buf);
              }
          }
-        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
  
          for (auto & buf : buf_map) {
              // indicate that this buffer contains weights
@@ -6247,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
      }
  
      // print memory requirements per buffer type
-    for (auto & [_, buf] : pimpl->ctxs_bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (auto & buf: bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
      }
  
      // populate tensors_by_name
@@ -6300,8 +6304,10 @@ size_t llama_model::n_devices() const {
  
  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
      std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (const auto & buf : bufs) {
+            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+        }
      }
      return ret;
  }
author	Johannes Gäßler <redacted>
	Mon, 27 Oct 2025 08:17:31 +0000 (09:17 +0100)
committer	GitHub <redacted>
	Mon, 27 Oct 2025 08:17:31 +0000 (09:17 +0100)