Co-authored-by: Johannes Gäßler <redacted>
}
static bool bf16_mma_hardware_available(const int cc) {
- return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
+ return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; // true for NVIDIA with cc >= AMPERE, for any CDNA cc, or for any cc >= RDNA3. NOTE(review): the last clause is a bare numeric comparison with no vendor check -- confirm the cc encoding restricts it to RDNA3-or-newer AMD devices.
+}
+
+static bool fp32_mma_hardware_available(const int cc) {
+ return GGML_CUDA_CC_IS_CDNA(cc); // reports FP32 MMA support exactly when GGML_CUDA_CC_IS_CDNA(cc) holds; no other vendor or architecture is considered here
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
return ne11 <= 4;
}
return ne11 <= 3;
+ } else if (GGML_CUDA_CC_IS_AMD(cc)) {
+ if (fp32_mma_hardware_available(cc)) {
+ return ne11 <= 3;
+ }
+ return ne11 <= 8;
}
return ne11 <= 8;
case GGML_TYPE_F16:
return src0_small && ne11 <= 3;
}
return ne11 <= 8;
+ } else if (GGML_CUDA_CC_IS_AMD(cc)) {
+ if (fp16_mma_hardware_available(cc)) {
+ if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+ return ne11 <= 5;
+ }
+ return ne11 <= 2;
+ }
+ return ne11 <= 8;
}
return ne11 <= 8;
case GGML_TYPE_BF16:
return src0_small && ne11 <= 3;
}
return ne11 <= 8;
+ } else if (GGML_CUDA_CC_IS_AMD(cc)) {
+ if (bf16_mma_hardware_available(cc)) {
+ return ne11 <= 3;
+ }
+ return ne11 <= 8;
}
return ne11 <= 8;
default: