cuda : fix LLAMA_CUDA_F16 (llama/5262)

author slaren <redacted>

Thu, 1 Feb 2024 17:30:17 +0000 (18:30 +0100)

committer Georgi Gerganov <redacted>

Sat, 10 Feb 2024 07:55:46 +0000 (09:55 +0200)
author slaren <redacted>
Thu, 1 Feb 2024 17:30:17 +0000 (18:30 +0100)
committer Georgi Gerganov <redacted>
Sat, 10 Feb 2024 07:55:46 +0000 (09:55 +0200)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index e5659574217952230f7df457f91ac1f9eb3de11f..3242a0b4ad7bf8c0440810135ae66692747eb23f 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -8657,9 +8657,9 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
  
      if (src1_convert_f16) {
          src1_dfloat = src1_dfloat_a.alloc(ne00);
-        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-                                ne00, 1, sizeof(float), 0, 0,
-                                ne00, 1, sizeof(half),  0, 0, stream);
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
      }
  #else
      const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
author	slaren <redacted>
	Thu, 1 Feb 2024 17:30:17 +0000 (18:30 +0100)
committer	Georgi Gerganov <redacted>
	Sat, 10 Feb 2024 07:55:46 +0000 (09:55 +0200)