CUDA: fix q_nope_absorbed precision for Deepseek 2 Lite f16 (llama/13137)

author Johannes Gäßler <redacted>

Mon, 28 Apr 2025 07:29:26 +0000 (09:29 +0200)

committer Georgi Gerganov <redacted>

Thu, 1 May 2025 07:39:34 +0000 (10:39 +0300)
author Johannes Gäßler <redacted>
Mon, 28 Apr 2025 07:29:26 +0000 (09:29 +0200)
committer Georgi Gerganov <redacted>
Thu, 1 May 2025 07:39:34 +0000 (10:39 +0300)
diff --git a/include/ggml.h b/include/ggml.h

index 51aa5b3a0ab44309a1c6dbc0de0ebdd438b8e6c1..1b8603e78e55348c9fcd4db8c39451a55b47c295 100644 (file)
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -393,8 +393,8 @@ extern "C" {
  
      // precision
      enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
+        GGML_PREC_DEFAULT =  0, // stored as ggml_tensor.op_params, 0 by default
+        GGML_PREC_F32     = 10,
      };
  
      // model file types
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu

index e0e0d2137f3be28ff70304b5caf14c1e257142de..19b9ce7231aa29086b10077961cbe54baa7f0aca 100644 (file)
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -1935,8 +1935,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
          ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
      } else if (!split && use_mul_mat_vec_q) {
          ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
-               && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+            dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
          // general KQ + KQV multi-batch without FlashAttention
          ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
      } else if (use_mul_mat_vec) {
author	Johannes Gäßler <redacted>
	Mon, 28 Apr 2025 07:29:26 +0000 (09:29 +0200)
committer	Georgi Gerganov <redacted>
	Thu, 1 May 2025 07:39:34 +0000 (10:39 +0300)
include/ggml.h		patch | blob | history
src/ggml-cuda/ggml-cuda.cu		patch | blob | history