From: R0CKSTAR Date: Tue, 19 Aug 2025 10:33:47 +0000 (+0800) Subject: musa: handle __hgt2_mask, available starting from MUSA SDK rc4.3.0 (#15413) X-Git-Tag: upstream/0.0.6527~323 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=67f09a3a27db443f9870aac87e163dba0d08131e;p=pkg%2Fggml%2Fsources%2Fllama.cpp musa: handle __hgt2_mask, available starting from MUSA SDK rc4.3.0 (#15413) Signed-off-by: Xiaodong Ye --- diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 2b14b30a..76ace816 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -78,6 +78,8 @@ #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads +#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons + #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD @@ -490,13 +492,14 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP) } -#if CUDART_VERSION < CUDART_HMASK +#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \ + (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK) static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) { const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b))); const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); return mask_low | mask_high; } -#endif // CUDART_VERSION < CUDART_HMASK +#endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK) static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { #if defined(GGML_USE_HIP)