size_t num_nodes = 0;
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
- bool disable_due_to_too_many_updates = false;
- int number_consecutive_updates = 0;
+ bool warmup_complete = false;
std::vector<ggml_cuda_graph_node_properties> props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector<ggml_cuda_graph_node_properties> extra;
- void record_update(bool use_graph, bool update_required) {
- if (use_graph && update_required) {
- number_consecutive_updates++;
- } else {
- number_consecutive_updates = 0;
- }
- if (number_consecutive_updates >= 4) {
- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
- disable_due_to_too_many_updates = true;
- }
- }
-
// Whether CUDA graph usage is currently allowed.
// Returns false when the GPU architecture was flagged as unsupported or when the
// GGML_CUDA_DISABLE_GRAPHS environment variable is set (checked once and cached
// in a function-local static). The patch drops the consecutive-update kill
// switch in favor of the warmup_complete mechanism introduced above.
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
- return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
+ return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
}
#endif
};
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
- if (graph->instance == nullptr) {
- res = true;
- }
-
// Check if the graph size has changed
if (graph->props.size() != (size_t)cgraph->n_nodes) {
res = true;
#ifdef USE_CUDA_GRAPH
graph_key = ggml_cuda_graph_get_key(cgraph);
- use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
+ ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->is_enabled()) {
- cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
-
- graph->record_update(use_cuda_graph, cuda_graph_update_required);
+ const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph);
+ if (graph_compatible) {
+ const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
+
+ if (!graph->warmup_complete) {
+ // Warmup: need at least 2 calls with no property change on the 2nd call
+ if (!properties_changed) {
+ graph->warmup_complete = true;
+ GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__);
+ use_cuda_graph = true;
+ cuda_graph_update_required = true;
+ }
+ // else: properties changed or first call - execute directly (use_cuda_graph stays false)
+ } else {
+ // Post-warmup: normal CUDA graph operation
+ if (properties_changed) {
+ // Properties changed - reset warmup, execute directly until stable again
+ graph->warmup_complete = false;
+ GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__);
+ } else {
+ use_cuda_graph = true;
+ cuda_graph_update_required = graph->instance == nullptr;
+ }
+ }
+ }
}
#endif // USE_CUDA_GRAPH