llama: automatically set parameters not set by the user in such a way that maximizes...

author Johannes Gäßler <redacted>

Mon, 15 Dec 2025 08:24:59 +0000 (09:24 +0100)

committer Georgi Gerganov <redacted>

Wed, 17 Dec 2025 11:55:04 +0000 (13:55 +0200)
author Johannes Gäßler <redacted>
Mon, 15 Dec 2025 08:24:59 +0000 (09:24 +0100)
committer Georgi Gerganov <redacted>
Wed, 17 Dec 2025 11:55:04 +0000 (13:55 +0200)
diff --git a/include/ggml-alloc.h b/include/ggml-alloc.h

index 2cb150fd2a313487c41112cd134fb4a5aa5f87cc..78aa059dde380ac6a88355de164b7fa377b7749d 100644 (file)
--- a/include/ggml-alloc.h
+++ b/include/ggml-alloc.h
@@ -53,7 +53,14 @@ GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
  // call with a worst-case graph to avoid buffer reallocations
  // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
  // returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes to sizes the per-buffer sizes that ggml_gallocr_reserve_n would allocate (one entry per galloc buffer)
  GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
  GGML_API bool ggml_gallocr_reserve_n(
      ggml_gallocr_t galloc,
      struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
  
  // Utils
  // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t                       ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
  GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
  GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
  
diff --git a/include/ggml-backend.h b/include/ggml-backend.h

index f1b740785914ed3577c6517937af0a34d4bcaaec..4ed5f35774ffcbb462cab7d06e28d3ff7d2a49b7 100644 (file)
--- a/include/ggml-backend.h
+++ b/include/ggml-backend.h
@@ -307,6 +307,7 @@ extern "C" {
      GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
  
      // Initialize backend buffers from a measure graph
+    GGML_API void                 ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
      GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
  
      GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
diff --git a/include/ggml.h b/include/ggml.h

index 686da3dbd107835c9454532da9ee1ddd1ea9f5c8..20c912d0e9bbe888062dd16853d872ea173e1492 100644 (file)
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -2615,7 +2615,8 @@ extern "C" {
  
      // Set callback for all future logging events.
      // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback   log_callback, void *  user_data);
  
      GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
  
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c

index ec16cbda9ff08df09998253309a2b7fb10d2460d..41419b617bdf76310d745e4ef36d9f000b82c72d 100644 (file)
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -594,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
  }
  
  static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL // tensor data already set externally
+        || t->buffer // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
  }
  
  // free the extra space at the end if the new tensor is smaller
@@ -823,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
      }
  }
  
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
      size_t min_hash_size = graph->n_nodes + graph->n_leafs;
      // add 25% margin to avoid hash collisions
      min_hash_size += min_hash_size / 4;
@@ -928,16 +931,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
                  size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
                  if (cur_size > 0) {
                      GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                        __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                        cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                        __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
                  }
              }
  #endif
              ggml_vbuffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-            if (galloc->buffers[i] == NULL) {
-                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+            if (no_alloc) {
+                galloc->buffers[i] = NULL;
+            } else {
+                galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+                if (galloc->buffers[i] == NULL) {
+                    GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                    return false;
+                }
              }
          }
      }
@@ -945,6 +951,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
      return true;
  }
  
+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
  bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
      return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
  }
@@ -1147,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
      return true;
  }
  
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
      GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
  
      size_t alignment = ggml_backend_buft_get_alignment(buft);
@@ -1155,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  
      ggml_backend_buffer_t * buffers = NULL;
      size_t n_buffers = 0;
+    *nbytes_total = 0;
  
      size_t cur_buf_size = 0;
      struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1166,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  
          if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
              // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                  return NULL;
              }
              first = t;
+            *nbytes_total += cur_buf_size;
              cur_buf_size = this_size;
          } else {
              cur_buf_size += this_size;
@@ -1178,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  
      // allocate remaining tensors
      if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
              return NULL;
          }
      }
  
+    if (no_alloc) {
+        return NULL;
+    }
+
      if (n_buffers == 0) {
  #ifndef NDEBUG
          GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
  #endif
+        GGML_ASSERT(!buffers);
          return NULL;
      }
  
@@ -1196,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
      } else {
          buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
      }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
      return buffer;
  }
  
+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
  ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
      return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
  }
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp

index 08681f35e3f96da2597612a57f4402ecee2926f9..8547ecc849c6fea7c013a332c3fac9ef25bdb4f7 100644 (file)
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
  }
  
  ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_ASSERT(buft);
      if (size == 0) {
          // return a dummy buffer for zero-sized allocations
          return ggml_backend_buffer_init(buft, {}, NULL, 0);
      }
-
-    GGML_ASSERT(buft);
      return buft->iface.alloc_buffer(buft, size);
  }
  
@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
          return NULL;
      }
  
+    // FIXME JG: a multi_buffer has a non-zero size but can reach this point without get_base, although the comment above
+    //     says get_base is not optional in that case; I don't know whether the comment above is correct
+    if (!buffer->iface.get_base) {
+        return NULL;
+    }
+
      void * base = buffer->iface.get_base(buffer);
  
      GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -1727,6 +1732,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
      sched->is_alloc = false;
  }
  
+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT(sched);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    GGML_ASSERT(sizes);
+
+    ggml_backend_sched_reset(sched);
+
+    ggml_backend_sched_synchronize(sched);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
      GGML_ASSERT(sched);
      GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
diff --git a/src/ggml.c b/src/ggml.c

index f0913cd35967f43e19ca9fe7a9d95ed5a2d3d550..eb3ae72eaacbf0108e43b983c8f8b0170d0aaf8b 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -7566,6 +7566,11 @@ size_t ggml_quantize_chunk(
  
  ////////////////////////////////////////////////////////////////////////////////
  
+void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    *log_callback = g_logger_state.log_callback;
+    *user_data    = g_logger_state.log_callback_user_data;
+}
+
  void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
      g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
      g_logger_state.log_callback_user_data = user_data;
author	Johannes Gäßler <redacted>
	Mon, 15 Dec 2025 08:24:59 +0000 (09:24 +0100)
committer	Georgi Gerganov <redacted>
	Wed, 17 Dec 2025 11:55:04 +0000 (13:55 +0200)
include/ggml-alloc.h		patch \| blob \| history
include/ggml-backend.h		patch \| blob \| history
include/ggml.h		patch \| blob \| history
src/ggml-alloc.c		patch \| blob \| history
src/ggml-backend.cpp		patch \| blob \| history
src/ggml.c		patch \| blob \| history