From: Georgi Gerganov Date: Wed, 12 Jul 2023 17:38:49 +0000 (+0300) Subject: cuda : sync llama.cpp X-Git-Tag: upstream/0.0.1642~1322 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=56216523fa1df0c0bc36201dbecd8e0a01668d91;p=pkg%2Fggml%2Fsources%2Fggml cuda : sync llama.cpp --- diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 55401c29..dc4b773a 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -1273,7 +1273,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __ } static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; int vi; @@ -1294,11 +1294,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]); @@ -1319,11 +1319,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; int qs; @@ -1354,11 +1354,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]); @@ -1388,11 +1388,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; int vi; @@ -1407,7 +1407,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } template @@ -2459,7 +2459,7 @@ inline void ggml_cuda_op_mul_mat_vec( src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented; + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) {