ggml : extend the GGML_SCHED_NO_REALLOC debug logic of the scheduler (llama/17617)

author Georgi Gerganov <redacted>
Mon, 1 Dec 2025 10:49:33 +0000 (12:49 +0200)
committer Georgi Gerganov <redacted>
Thu, 11 Dec 2025 13:32:51 +0000 (15:32 +0200)
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp
index 4cf377e7f3308fcf233a33dd87ff5c7ca7d973d7..1d88c826bb1a9d8e93e4ba90b5d2c02a10b88804 100644
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -723,6 +723,12 @@ struct ggml_backend_sched {
      bool op_offload;
  
      int debug;
+
+    // used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17617
+    int debug_realloc;
+    int debug_graph_size;
+    int debug_prev_graph_size;
  };
  
  #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1289,6 +1295,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
      }
  
      int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
+
+    // remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
+    sched->debug_prev_graph_size = sched->debug_graph_size;
+    sched->debug_graph_size = graph_size;
+
      if (sched->graph.size < graph_size) {
          sched->graph.size = graph_size;
          sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@@ -1395,14 +1406,21 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  
      // allocate graph
      if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-#ifdef GGML_SCHED_NO_REALLOC
-        GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
-#endif
-
  #ifndef NDEBUG
          GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
  #endif
  
+        if (sched->debug_realloc > 0) {
+            // we are interested only in situations where the graph was reallocated even though its size remained the same [GGML_SCHED_DEBUG_REALLOC]
+            // example: https://github.com/ggml-org/llama.cpp/pull/17143
+            const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
+
+            if (unexpected || sched->debug_realloc > 1) {
+                GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
+                        sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
+            }
+        }
+
          // the re-allocation may cause the split inputs to be moved to a different address
          // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
          for (int i = 0; i < sched->n_backends; i++) {
@@ -1620,6 +1638,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
  
      const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
      sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
+
+    sched->debug_realloc = 0;
+#ifdef GGML_SCHED_NO_REALLOC
+    sched->debug_realloc = 1;
+#endif
+    const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
+    sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
+
      sched->n_backends = n_backends;
      sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
  
@@ -1636,6 +1662,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
      sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
      sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
  
+    sched->debug_graph_size = 0;
+    sched->debug_prev_graph_size = 0;
+
      sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
      sched->context_buffer = (char *) malloc(sched->context_buffer_size);
author	Georgi Gerganov <redacted>
	Mon, 1 Dec 2025 10:49:33 +0000 (12:49 +0200)
committer	Georgi Gerganov <redacted>
	Thu, 11 Dec 2025 13:32:51 +0000 (15:32 +0200)