]> git.djapps.eu Git - pkg/ggml/sources/ggml/commitdiff
Reduce CPU-side stalls due to the CUDA command buffer being full (llama/19042)
authorGaurav Garg <redacted>
Tue, 27 Jan 2026 06:52:44 +0000 (06:52 +0000)
committerGeorgi Gerganov <redacted>
Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
* [CUDA] Reduce CPU-side stalls due to the CUDA command buffer being full

With pipeline parallelism, during prompt processing, the CPU-side CUDA command buffer gets full, stalling the CPU. Due to this, enough work doesn't get submitted to the GPU, causing bubbles in the GPU timeline.
Fix this by setting the CUDA environment variable CUDA_SCALE_LAUNCH_QUEUES to 4x to increase the command buffer size.

* Set the env variable in the CUDA backend registry allocation

* Add link to PR in code comment

* Remove warning logs and update documentation

src/ggml-cuda/ggml-cuda.cu

index 99f0919a514786608c6fddff25f489678bf6245d..e9df0ea4a7c4398b5173fd12b4a0ece838270589 100644 (file)
@@ -4876,6 +4876,16 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
+            // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
+            // PR: https://github.com/ggml-org/llama.cpp/pull/19042
+            if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
+#ifdef _WIN32
+                _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
+#else
+                setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
+#endif // _WIN32
+            }
+
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
             const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;