]> git.djapps.eu Git - pkg/ggml/sources/ggml/commitdiff
Avoid unnecessarily disabling CUDA graphs (llama/7302)
authoragray3 <redacted>
Wed, 15 May 2024 13:44:49 +0000 (14:44 +0100)
committerGeorgi Gerganov <redacted>
Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)
As discussed in PR #6766, CUDA graphs were being disabled in the presence of long prompts.
This fixes the issue by preventing the consecutive-update counter from incrementing unnecessarily
for tokens in which CUDA graphs are disabled due to batch size > 1.

src/ggml-cuda.cu

index 75a2ad480877d5c361e8c4abaa8975fe97aa4ccf..04b6e52859ed56fbb4762faa97ef7a791dd9bff8 100644 (file)
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
             cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
             cuda_ctx->cuda_graph->number_consecutive_updates = 0;