cuda : revert CUDA_SCALE_LAUNCH_QUEUES override until investigated (llama/19227)
author    Gaurav Garg <redacted>
          Tue, 3 Feb 2026 06:41:02 +0000 (12:11 +0530)
committer Georgi Gerganov <redacted>
          Sat, 7 Feb 2026 08:37:38 +0000 (10:37 +0200)
Hangs were reported on Jetson Orin AGX when CUDA_SCALE_LAUNCH_QUEUES=4x was set. This reverts the previous PR (#19042) and updates the documentation to suggest setting CUDA_SCALE_LAUNCH_QUEUES=4x manually for faster throughput on multi-GPU systems.
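
For reference, since the backend no longer sets the variable itself, users who want the behavior back can opt in manually. A minimal user-side sketch (the main() wrapper is illustrative; only the setenv/_putenv_s calls mirror the reverted code):

    #include <cstdlib>

    int main() {
        // Must run before the first CUDA API call in the process,
        // since the CUDA runtime reads the variable at initialization.
    #ifdef _WIN32
        _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
    #else
        setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // 0: don't overwrite an existing value
    #endif
        // ... initialize the ggml CUDA backend and run inference as usual ...
        return 0;
    }

In practice, exporting CUDA_SCALE_LAUNCH_QUEUES=4x in the shell before launching the binary achieves the same effect, since child processes inherit the environment.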

src/ggml-cuda/ggml-cuda.cu

index 08383edb40290c5bb3f6d4c35b8c60a15a770f54..1bcd1ab1f8f0eed73017fb9eca1c195ceaea0191 100644
@@ -5049,16 +5049,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
-            // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
-            // PR: https://github.com/ggml-org/llama.cpp/pull/19042
-            if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
-#ifdef _WIN32
-                _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
-#else
-                setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
-#endif // _WIN32
-            }
-
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
             const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;