return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}
+// when a node reuses its parent's memory in place but needs less space, free the unused tail of the parent's allocation
+static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+
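+ // alloc sizes are queried through the buffer types, since they may pad beyond the raw tensor size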
+ size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
+ size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+
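+ // the node reuses the parent's memory, so it must fit within the parent's allocation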
+ GGML_ASSERT(parent_size >= node_size);
+
+ if (parent_size > node_size) {
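+ // the tail of the parent's block, starting right after the node's bytes, can be returned to the dynamic allocator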
+ struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+ struct buffer_address p_addr = p_hn->addr;
+ p_addr.offset += node_size;
+ size_t extra_size = parent_size - node_size;
+ AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+ ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+ }
+}
+
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
view_src_hn->allocated = false;
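+ // the node may be smaller than the view source whose memory it now owns; reclaim the difference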
+ ggml_gallocr_free_extra_space(galloc, node, view_src);
return;
}
} else {
hn->buffer_id = p_hn->buffer_id;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
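+ // likewise reclaim any tail left over from taking over the parent's allocation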
+ ggml_gallocr_free_extra_space(galloc, node, parent);
return;
}
}