Reduce CPU-side stalls due to the CUDA command buffer being full (llama/19042)

author Gaurav Garg <redacted>

Tue, 27 Jan 2026 06:52:44 +0000 (06:52 +0000)

committer Georgi Gerganov <redacted>

Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
author Gaurav Garg <redacted>
Tue, 27 Jan 2026 06:52:44 +0000 (06:52 +0000)
committer Georgi Gerganov <redacted>
Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu

index 99f0919a514786608c6fddff25f489678bf6245d..e9df0ea4a7c4398b5173fd12b4a0ece838270589 100644 (file)
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -4876,6 +4876,16 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
          static std::mutex mutex;
          std::lock_guard<std::mutex> lock(mutex);
          if (!initialized) {
+            // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
+            // PR: https://github.com/ggml-org/llama.cpp/pull/19042
+            if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
+#ifdef _WIN32
+                _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
+#else
+                setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
+#endif // _WIN32
+            }
+
              ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
              const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
author	Gaurav Garg <redacted>
	Tue, 27 Jan 2026 06:52:44 +0000 (06:52 +0000)
committer	Georgi Gerganov <redacted>
	Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)