CUDA 12.8 added an option to select stronger compression for binaries, so we now default to the "size" mode.
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
+# Compression mode for CUDA binaries; defaults to "size". Per the help text
+# below, the setting needs CUDA 12.8 or newer.
+set(GGML_CUDA_COMPRESSION_MODE
+    "size"
+    CACHE STRING
+    "ggml: cuda link binary compression mode; requires cuda 12.8+")
+# Choices offered for this cache entry by cmake-gui / ccmake.
+set_property(CACHE GGML_CUDA_COMPRESSION_MODE
+             PROPERTY STRINGS none speed balance size)
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
set(CUDA_FLAGS -use_fast_math)
+# Pass -compress-mode only when the detected CUDA toolkit is 12.8 or newer.
+# Options are:
+#   - none    (not recommended)
+#   - speed   (nvcc's default)
+#   - balance
+#   - size
+if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
+    # An empty or undefined GGML_CUDA_COMPRESSION_MODE would expand to a bare
+    # "-compress-mode=" with no value, so the flag is left out in that case
+    # and nvcc keeps its own default mode.
+    if (NOT "${GGML_CUDA_COMPRESSION_MODE}" STREQUAL "")
+        list(APPEND CUDA_FLAGS "-compress-mode=${GGML_CUDA_COMPRESSION_MODE}")
+    endif()
+endif()
+
if (GGML_FATAL_WARNINGS)
list(APPEND CUDA_FLAGS -Werror all-warnings)
endif()