CUDA: fix replacement of bad archs in CMake (#18457)

author Johannes Gäßler <redacted>

Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)

committer GitHub <redacted>

Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
author Johannes Gäßler <redacted>
Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
committer GitHub <redacted>
Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
diff --git a/docs/build.md b/docs/build.md

index 4a6911778c91f595ec4ffee6e24586e4b8f4b316..63fd8b4fcd045020a25277d0503205b97686a590 100644 (file)
--- a/docs/build.md
+++ b/docs/build.md
@@ -150,19 +150,38 @@ We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in
  
  
  ### Compilation
+
+Make sure to read the notes about the CPU build for general instructions for e.g. speeding up the compilation.
+
  ```bash
  cmake -B build -DGGML_CUDA=ON
  cmake --build build --config Release
  ```
  
+### Non-Native Builds
+
+By default llama.cpp will be built for the hardware that is connected to the system at that time.
+For a build covering all CUDA GPUs, disable `GGML_NATIVE`:
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=OFF
+```
+
+The resulting binary should run on all CUDA GPUs with optimal performance, though some just-in-time compilation may be required.
+
  ### Override Compute Capability Specifications
  
-If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+If `nvcc` cannot detect your GPU, you may get compile warnings such as:
   ```text
  nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
  ```
  
-To override the `native` GPU detection:
+One option is to do a non-native build as described above.
+However, this will result in a large binary that takes a long time to compile.
+Alternatively, it is possible to explicitly specify CUDA architectures.
+This may also make sense for a non-native build; in that case, use the logic in `ggml/src/ggml-cuda/CMakeLists.txt` as a starting point.
+
+To override the default CUDA architectures:
  
  #### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus).
  
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt

index 73f2910b7c56cf81c43abf7a8653de497e282798..ae8f963f6991fc9d24546451b753e1f39a832ac2 100644 (file)
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -51,35 +51,35 @@ if (CUDAToolkit_FOUND)
              endif()
          endif()
      endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
  
      enable_language(CUDA)
  
-    # Replace any 12x-real architectures with 12x{a}-real. FP4 ptx instructions are not available in just 12x
-    if (GGML_NATIVE)
-        set(PROCESSED_ARCHITECTURES "")
-        if (CMAKE_CUDA_ARCHITECTURES_NATIVE)
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
-        else()
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES})
-        endif()
-        foreach(ARCH ${ARCH_LIST})
+    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
+    # 12X is forwards-compatible, 12Xa is not.
+    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
+    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
+    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
+    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
+        set(FIXED_ARCHS "")
+        foreach(ARCH IN LISTS ${ARCHS})
              if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                string(REGEX REPLACE "^(12[0-9]).*$" "\\1" BASE_ARCH ${ARCH})
-                message(STATUS "Replacing ${ARCH} with ${BASE_ARCH}a-real")
-                list(APPEND PROCESSED_ARCHITECTURES "${BASE_ARCH}a-real")
+                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
+                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
+                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
              else()
-                list(APPEND PROCESSED_ARCHITECTURES ${ARCH})
-            endif()
-        endforeach()
-        set(CMAKE_CUDA_ARCHITECTURES ${PROCESSED_ARCHITECTURES})
-    else()
-        foreach(ARCH ${CMAKE_CUDA_ARCHITECTURES})
-            if(ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                message(FATAL_ERROR "Compute capability ${ARCH} used, use ${ARCH}a or ${ARCH}f for Blackwell-specific optimizations")
+                list(APPEND FIXED_ARCHS "${ARCH}")
              endif()
          endforeach()
+        set(${ARCHS} ${FIXED_ARCHS})
+    endforeach()
+
+    # If we try to compile a "native" build it will use the 12X architectures and fail.
+    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
+    # But if no GPUs are connected at all at the time of the build, CMAKE_CUDA_ARCHITECTURES_NATIVE will contain garbage that we should not use.
+    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
+        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
      endif()
+    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")
  
      file(GLOB   GGML_HEADERS_CUDA "*.cuh")
      list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
author	Johannes Gäßler <redacted>
	Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
committer	GitHub <redacted>
	Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
docs/build.md		patch \| blob \| history
ggml/src/ggml-cuda/CMakeLists.txt		patch \| blob \| history