CUDA: fix replacement of bad archs in CMake (llama/18457)

author Johannes Gäßler <redacted>

Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)

committer Georgi Gerganov <redacted>

Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
author Johannes Gäßler <redacted>
Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
committer Georgi Gerganov <redacted>
Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt

index 73f2910b7c56cf81c43abf7a8653de497e282798..ae8f963f6991fc9d24546451b753e1f39a832ac2 100644 (file)
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -51,35 +51,35 @@ if (CUDAToolkit_FOUND)
              endif()
          endif()
      endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
  
      enable_language(CUDA)
  
-    # Replace any 12x-real architectures with 12x{a}-real. FP4 ptx instructions are not available in just 12x
-    if (GGML_NATIVE)
-        set(PROCESSED_ARCHITECTURES "")
-        if (CMAKE_CUDA_ARCHITECTURES_NATIVE)
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
-        else()
-            set(ARCH_LIST ${CMAKE_CUDA_ARCHITECTURES})
-        endif()
-        foreach(ARCH ${ARCH_LIST})
+    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
+    # 12X is forwards-compatible, 12Xa is not.
+    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
+    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
+    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
+    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
+        set(FIXED_ARCHS "")
+        foreach(ARCH IN LISTS ${ARCHS})
              if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                string(REGEX REPLACE "^(12[0-9]).*$" "\\1" BASE_ARCH ${ARCH})
-                message(STATUS "Replacing ${ARCH} with ${BASE_ARCH}a-real")
-                list(APPEND PROCESSED_ARCHITECTURES "${BASE_ARCH}a-real")
+                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
+                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
+                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
              else()
-                list(APPEND PROCESSED_ARCHITECTURES ${ARCH})
-            endif()
-        endforeach()
-        set(CMAKE_CUDA_ARCHITECTURES ${PROCESSED_ARCHITECTURES})
-    else()
-        foreach(ARCH ${CMAKE_CUDA_ARCHITECTURES})
-            if(ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                message(FATAL_ERROR "Compute capability ${ARCH} used, use ${ARCH}a or ${ARCH}f for Blackwell-specific optimizations")
+                list(APPEND FIXED_ARCHS "${ARCH}")
              endif()
          endforeach()
+        set(${ARCHS} ${FIXED_ARCHS})
+    endforeach()
+
+    # If we try to compile a "native" build it will use the 12X architectures and fail.
+    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
+    # But if at the time of the build no GPUs are connected at all, CMAKE_CUDA_ARCHITECTURES_NATIVE will contain garbage that we should not use.
+    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
+        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
      endif()
+    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")
  
      file(GLOB   GGML_HEADERS_CUDA "*.cuh")
      list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
author	Johannes Gäßler <redacted>
	Mon, 29 Dec 2025 16:58:20 +0000 (17:58 +0100)
committer	Georgi Gerganov <redacted>
	Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)