option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
// AMD CDNA3 matrix cores. Support for other CDNA generations will be added later.
static bool amd_mfma_available(const int cc) {
-    return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
+#if !defined(GGML_HIP_NO_MMQ_MFMA)
+    return GGML_CUDA_CC_IS_CDNA3(cc);
+#else
+    return false;
+#endif // !defined(GGML_HIP_NO_MMQ_MFMA)
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
endif()
+if (NOT GGML_HIP_MMQ_MFMA)
+    add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
+endif()
+
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
endif()
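For reference, a minimal configure sketch of how the new toggle would be driven from the command line. Only GGML_HIP_MMQ_MFMA comes from this change; the GGML_HIP toggle, build directory, and build-type flag are assumptions about a typical HIP build. Setting the option to OFF adds the GGML_HIP_NO_MMQ_MFMA compile definition, which keeps AMD_MFMA_AVAILABLE undefined and makes amd_mfma_available() return false.

# default: MFMA-based MMQ stays enabled on CDNA3 (GGML_HIP_MMQ_MFMA is ON by default)
cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release
# opt out: defines GGML_HIP_NO_MMQ_MFMA for the HIP backend
cmake -B build -DGGML_HIP=ON -DGGML_HIP_MMQ_MFMA=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build -j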