From: Gaurav Garg Date: Tue, 3 Feb 2026 06:41:02 +0000 (+0530) Subject: cuda : revert CUDA_SCALE_LAUNCH_QUEUES override until investigated (llama/19227) X-Git-Tag: v0.9.7~64 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=8c2086e7fe98fd222835c12f41358a1f7715f6d3;p=pkg%2Fggml%2Fsources%2Fggml cuda : revert CUDA_SCALE_LAUNCH_QUEUES override until investigated (llama/19227) Hangs were reported on Jetson Orin AGX if we set CUDA_SCALE_LAUNCH_QUEUES=4x. Reverting the previous PR (#19042) and updating the document to consider setting CUDA_SCALE_LAUNCH_QUEUES=4x for faster throughput on multi-GPU systems. --- diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu index 08383edb..1bcd1ab1 100644 --- a/src/ggml-cuda/ggml-cuda.cu +++ b/src/ggml-cuda/ggml-cuda.cu @@ -5049,16 +5049,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { - // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance - // PR: https://github.com/ggml-org/llama.cpp/pull/19042 - if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) { -#ifdef _WIN32 - _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x"); -#else - setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set -#endif // _WIN32 - } - ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;