From: Johannes Gäßler
Date: Mon, 29 Dec 2025 16:58:20 +0000 (+0100)
Subject: CUDA: fix replacement of bad archs in CMake (#18457)
X-Git-Tag: upstream/0.0.7599~24
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=0bd1212a43561a55d89d468b7ca5265647dab1fd;p=pkg%2Fggml%2Fsources%2Fllama.cpp

CUDA: fix replacement of bad archs in CMake (#18457)
---

diff --git a/docs/build.md b/docs/build.md
index 4a691177..63fd8b4f 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -150,19 +150,38 @@ We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in
 ### Compilation
+
+Make sure to read the notes about the CPU build for general instructions for e.g. speeding up the compilation.
+
 ```bash
 cmake -B build -DGGML_CUDA=ON
 cmake --build build --config Release
 ```
+
+### Non-Native Builds
+
+By default llama.cpp will be built for the hardware that is connected to the system at that time.
+For a build covering all CUDA GPUs, disable `GGML_NATIVE`:
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=OFF
+```
+
+The resulting binary should run on all CUDA GPUs with optimal performance, though some just-in-time compilation may be required.
+
 ### Override Compute Capability Specifications
-If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+If `nvcc` cannot detect your gpu, you may get compile warnings such as:
 ```text
 nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
 ```
-To override the `native` GPU detection:
+One option is to do a non-native build as described above.
+However, this will result in a large binary that takes a long time to compile.
+Alternatively it is also possible to explicitly specify CUDA architectures.
+This may also make sense for a non-native build, for that one should look at the logic in `ggml/src/ggml-cuda/CMakeLists.txt` as a starting point.
+
+To override the default CUDA architectures:
 #### 1. 
 Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute Capability"](https://developer.nvidia.com/cuda-gpus).

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 73f2910b..ae8f963f 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -51,35 +51,35 @@ if (CUDAToolkit_FOUND)
             endif()
         endif()
     endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
     enable_language(CUDA)
-    # Replace any 12x-real architectures with 12x{a}-real. FP4 ptx instructions are not available in just 12x
-    if (GGML_NATIVE)
-        set(PROCESSED_ARCHITECTURES "")
-        if (CMAKE_CUDA_ARCHITECTURES_NATIVE)
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
-        else()
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES})
-        endif()
-        foreach(ARCH ${ARCH_LIST})
+    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
+    # 12X is forwards-compatible, 12Xa is not.
+    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
+    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
+    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
+    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
+        set(FIXED_ARCHS "")
+        foreach(ARCH IN LISTS ${ARCHS})
             if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                string(REGEX REPLACE "^(12[0-9]).*$" "\\1" BASE_ARCH ${ARCH})
-                message(STATUS "Replacing ${ARCH} with ${BASE_ARCH}a-real")
-                list(APPEND PROCESSED_ARCHITECTURES "${BASE_ARCH}a-real")
+                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
+                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
+                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
             else()
-                list(APPEND PROCESSED_ARCHITECTURES ${ARCH})
-            endif()
-        endforeach()
-        set(CMAKE_CUDA_ARCHITECTURES ${PROCESSED_ARCHITECTURES})
-    else()
-        foreach(ARCH ${CMAKE_CUDA_ARCHITECTURES})
-            if(ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                message(FATAL_ERROR "Compute capability ${ARCH} used, use ${ARCH}a or ${ARCH}f for Blackwell-specific optimizations")
+                list(APPEND FIXED_ARCHS "${ARCH}")
             endif()
         endforeach()
+        set(${ARCHS} ${FIXED_ARCHS})
+    endforeach()
+
+    # If we try to compile a "native" build it will use the 12X architectures and fail.
+    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
+    # But if at the time of the build no GPUs are connected at all CMAKE_CUDA_ARCHITECTURES will contain garbage that we should not use.
+    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
+        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
     endif()
+    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")

     file(GLOB   GGML_HEADERS_CUDA "*.cuh")
     list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")