Avoid unnecessarily disabling CUDA graphs (llama/7302)

author agray3 <redacted>

Wed, 15 May 2024 13:44:49 +0000 (14:44 +0100)

committer Georgi Gerganov <redacted>

Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)
author agray3 <redacted>
Wed, 15 May 2024 13:44:49 +0000 (14:44 +0100)
committer Georgi Gerganov <redacted>
Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)
diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu

index 75a2ad480877d5c361e8c4abaa8975fe97aa4ccf..04b6e52859ed56fbb4762faa97ef7a791dd9bff8 100644 (file)
--- a/src/ggml-cuda.cu
+++ b/src/ggml-cuda.cu
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          }
  
          // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
              cuda_ctx->cuda_graph->number_consecutive_updates++;
          } else {
              cuda_ctx->cuda_graph->number_consecutive_updates = 0;
author	agray3 <redacted>
	Wed, 15 May 2024 13:44:49 +0000 (14:44 +0100)
committer	Georgi Gerganov <redacted>
	Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)