From: Jeroen Mostert Date: Tue, 23 Jul 2024 08:50:40 +0000 (+0200) Subject: Allow all RDNA2 archs to use sdot4 intrinsic (llama/8629) X-Git-Tag: upstream/0.0.1642~502 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=5937ee2959ea5bb906a1b418ee1c05271c5dc387;p=pkg%2Fggml%2Fsources%2Fggml Allow all RDNA2 archs to use sdot4 intrinsic (llama/8629) The check gating the use of `__builtin_amdgcn_sdot4` specifically checks for gfx1030. This causes a severe perf regression for anything gfx103? that's not gfx1030 and not using `HSA_OVERRIDE_GFX_VERSION` (if you've built ROCm to support it). We already have a generic RDNA2 define; let's use it. --- diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh index 26d9412a..1c2e00c1 100644 --- a/src/ggml-cuda/common.cuh +++ b/src/ggml-cuda/common.cuh @@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) c = __builtin_amdgcn_sdot4(a, b, c, false); #elif defined(RDNA3) c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);