From: Johannes Gäßler
Date: Mon, 29 Dec 2025 16:58:20 +0000 (+0100)
Subject: CUDA: fix replacement of bad archs in CMake (#18457)
X-Git-Tag: upstream/0.0.7599~24
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=0bd1212a43561a55d89d468b7ca5265647dab1fd;p=pkg%2Fggml%2Fsources%2Fllama.cpp

CUDA: fix replacement of bad archs in CMake (#18457)
---

diff --git a/docs/build.md b/docs/build.md
index 4a691177..63fd8b4f 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -150,19 +150,38 @@ We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in
 ### Compilation
+
+Make sure to read the notes about the CPU build for general instructions for e.g. speeding up the compilation.
+
 ```bash
 cmake -B build -DGGML_CUDA=ON
 cmake --build build --config Release
 ```
+
+### Non-Native Builds
+
+By default llama.cpp will be built for the hardware that is connected to the system at that time.
+For a build covering all CUDA GPUs, disable `GGML_NATIVE`:
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=OFF
+```
+
+The resulting binary should run on all CUDA GPUs with optimal performance, though some just-in-time compilation may be required.
+
 ### Override Compute Capability Specifications
-If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+If `nvcc` cannot detect your gpu, you may get compile warnings such as:
 ```text
 nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
 ```
-To override the `native` GPU detection:
+One option is to do a non-native build as described above.
+However, this will result in a large binary that takes a long time to compile.
+Alternatively it is also possible to explicitly specify CUDA architectures.
+This may also make sense for a non-native build, for that one should look at the logic in `ggml/src/ggml-cuda/CMakeLists.txt` as a starting point.
+
+To override the default CUDA architectures:
 #### 1. 
 Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute Capability"](https://developer.nvidia.com/cuda-gpus).

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 73f2910b..ae8f963f 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -51,35 +51,35 @@ if (CUDAToolkit_FOUND)
             endif()
         endif()
     endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
     enable_language(CUDA)
-    # Replace any 12x-real architectures with 12x{a}-real. FP4 ptx instructions are not available in just 12x
-    if (GGML_NATIVE)
-        set(PROCESSED_ARCHITECTURES "")
-        if (CMAKE_CUDA_ARCHITECTURES_NATIVE)
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
-        else()
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES})
-        endif()
-        foreach(ARCH ${ARCH_LIST})
+    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
+    # 12X is forwards-compatible, 12Xa is not.
+    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
+    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
+    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
+    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
+        set(FIXED_ARCHS "")
+        foreach(ARCH IN LISTS ${ARCHS})
             if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                string(REGEX REPLACE "^(12[0-9]).*$" "\\1" BASE_ARCH ${ARCH})
-                message(STATUS "Replacing ${ARCH} with ${BASE_ARCH}a-real")
-                list(APPEND PROCESSED_ARCHITECTURES "${BASE_ARCH}a-real")
+                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
+                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
+                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
             else()
-                list(APPEND PROCESSED_ARCHITECTURES ${ARCH})
-            endif()
-        endforeach()
-        set(CMAKE_CUDA_ARCHITECTURES ${PROCESSED_ARCHITECTURES})
-    else()
-        foreach(ARCH ${CMAKE_CUDA_ARCHITECTURES})
-            if(ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                message(FATAL_ERROR "Compute capability ${ARCH} used, use ${ARCH}a or ${ARCH}f for Blackwell-specific optimizations")
+                list(APPEND FIXED_ARCHS "${ARCH}")
             endif()
         endforeach()
+        set(${ARCHS} ${FIXED_ARCHS})
+    endforeach()
+
+    # If we try to compile a "native" build it will use the 12X architectures and fail.
+    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
+    # But if at the time of the build no GPUs are connected at all CMAKE_CUDA_ARCHITECTURES will contain garbage that we should not use.
+    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
+        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
     endif()
+    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")

     file(GLOB   GGML_HEADERS_CUDA "*.cuh")
     list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")