From: Beinsezii
Date: Tue, 6 Jan 2026 15:26:07 +0000 (-0800)
Subject: mmq.cu: tune mmq/rocblas switching for RDNA (#18537)
X-Git-Tag: upstream/0.0.7721~76
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=968929528c6a05e10249366fbe5f0330ad9af678;p=pkg%2Fggml%2Fsources%2Fllama.cpp

mmq.cu: tune mmq/rocblas switching for RDNA (#18537)

* Patch perf regression for mmq kernels in ROCm

recover performance regression for https://github.com/ggml-org/llama.cpp/issues/17917

* add n_experts branch like the cdna path

* mmq.cu: tune mmq/wmma switching for RDNA

* mmq.cu: move amd wmma mmq/wmma switching behind IS_RDNA3

* Update ggml/src/ggml-cuda/mmq.cu

Co-authored-by: Johannes Gäßler

---------

Co-authored-by: Jiacheng (Jason) Chen
Co-authored-by: jiachengjason
Co-authored-by: Johannes Gäßler
---

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 85692d45..ceb95758 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,6 +333,28 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts almost always better on MMQ
+            // due to a large amount of graph splits
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
+                return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually worse but not always
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        }
         return true;
     }
 