size_t num_nodes = 0;
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
- bool disable_due_to_too_many_updates = false;
- int number_consecutive_updates = 0;
+ bool warmup_complete = false;
std::vector<ggml_cuda_graph_node_properties> props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector<ggml_cuda_graph_node_properties> extra;
- void record_update(bool use_graph, bool update_required) {
- if (use_graph && update_required) {
- number_consecutive_updates++;
- } else {
- number_consecutive_updates = 0;
- }
- if (number_consecutive_updates >= 4) {
- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
- disable_due_to_too_many_updates = true;
- }
- }
-
// Whether CUDA graph usage is currently allowed.
// Returns false when the GPU architecture was flagged as unsupported or when the
// GGML_CUDA_DISABLE_GRAPHS environment variable is set (checked once and cached
// in a function-local static). The patch drops the consecutive-update kill
// switch in favor of the warmup_complete mechanism introduced above.
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
- return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
+ return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
}
#endif
};
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
- if (graph->instance == nullptr) {
- res = true;
- }
-
// Check if the graph size has changed
if (graph->props.size() != (size_t)cgraph->n_nodes) {
res = true;
#ifdef USE_CUDA_GRAPH
graph_key = ggml_cuda_graph_get_key(cgraph);
- use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
+ ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->is_enabled()) {
- cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
-
- graph->record_update(use_cuda_graph, cuda_graph_update_required);
+ const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph);
+ if (graph_compatible) {
+ const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
+
+ if (!graph->warmup_complete) {
+ // Warmup: need at least 2 calls with no property change on the 2nd call
+ if (!properties_changed) {
+ graph->warmup_complete = true;
+ GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__);
+ use_cuda_graph = true;
+ cuda_graph_update_required = true;
+ }
+ // else: properties changed or first call - execute directly (use_cuda_graph stays false)
+ } else {
+ // Post-warmup: normal CUDA graph operation
+ if (properties_changed) {
+ // Properties changed - reset warmup, execute directly until stable again
+ graph->warmup_complete = false;
+ GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__);
+ } else {
+ use_cuda_graph = true;
+ cuda_graph_update_required = graph->instance == nullptr;
+ }
+ }
+ }
}
#endif // USE_CUDA_GRAPH