option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
# The default behavior for a non-native build is to build virtual architectures as needed to cover all features needed
# for best performance and to also build real architectures for the most commonly used GPUs.
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
set(CMAKE_CUDA_ARCHITECTURES "native")
- elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
- if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
- set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
- else()
- set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
- endif()
else()
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
add_compile_definitions(GGML_CUDA_NO_FA)
endif()
- if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
- add_compile_definitions(GGML_CUDA_F16)
- endif()
-
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11010
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef float2 dfloat2;
-#endif // GGML_CUDA_F16
-
#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#define GGML_USE_VMM
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#endif // CUDART_VERSION >= 12050
}
-typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
static __device__ __forceinline__ float get_alibi_slope(
const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
const int64_t y_offset = qr == 1 ? 1 : qk/2;
// dequantize
- dfloat2 v;
+ float2 v;
dequantize_kernel(vx, ib, iqs, v);
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
#pragma unroll
for (int j = 0; j < QK8_0; j += 2) {
- dfloat2 dq;
+ float2 dq;
dequantize_q8_0(cxi, 0, j, dq);
*(cdstf + j) = dq.x;
*(cdstf + j + 1) = dq.y;
#pragma unroll
for (int j = 0; j < qk/2; j++) {
- dfloat2 dq;
+ float2 dq;
dequant(cxi, 0, j, dq);
*(cdstf + j) = dq.x;
*(cdstf + j + qk/2) = dq.y;
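// Illustrative sketch (hypothetical helper, not part of this change): any
// dequantizer matching the dequantize_kernel_t signature above can expand a
// whole block this way, writing the first value of each pair at offset j and
// the second at j + qk/2, exactly as in the loop above.
template <int qk, dequantize_kernel_t dequant>
static __device__ void dequantize_block_sketch(const void * vx, const int64_t ib, float * dst) {
    for (int j = 0; j < qk/2; ++j) {
        float2 dq;
        dequant(vx, ib, j, dq);
        dst[j]        = dq.x;
        dst[j + qk/2] = dq.y;
    }
}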
#include "common.cuh"
-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q4_0 * x = (const block_q4_0 *) vx;
- const dfloat d = x[ib].d;
+ const float d = x[ib].d;
const int vui = x[ib].qs[iqs];
v.x = vui & 0xF;
v.y = vui >> 4;
-#ifdef GGML_CUDA_F16
- v = __hsub2(v, {8.0f, 8.0f});
- v = __hmul2(v, {d, d});
-#else
v.x = (v.x - 8.0f) * d;
v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_F16
}
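// Sketch of the scalar rule dequantize_q4_0 applies to both nibbles, assuming
// the standard ggml q4_0 encoding (4-bit unsigned quant q in [0, 15], shared
// scale d, implicit offset of 8). Helper name is illustrative only.
static __device__ __forceinline__ float dequantize_q4_0_scalar_sketch(const int q, const float d) {
    return (q - 8.0f) * d; // maps q = 0..15 to the range [-8*d, 7*d]
}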
-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q4_1 * x = (const block_q4_1 *) vx;
- const dfloat d = __low2half(x[ib].dm);
- const dfloat m = __high2half(x[ib].dm);
+ const float2 dm = __half22float2(x[ib].dm);
const int vui = x[ib].qs[iqs];
v.x = vui & 0xF;
v.y = vui >> 4;
-#ifdef GGML_CUDA_F16
- v = __hmul2(v, {d, d});
- v = __hadd2(v, {m, m});
-#else
- v.x = (v.x * d) + m;
- v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
+ v.x = (v.x * dm.x) + dm.y;
+ v.y = (v.y * dm.x) + dm.y;
}
-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q5_0 * x = (const block_q5_0 *) vx;
- const dfloat d = x[ib].d;
+ const float d = x[ib].d;
uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))      ) & 0x10;
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
-#ifdef GGML_CUDA_F16
- v = __hsub2(v, {16.0f, 16.0f});
- v = __hmul2(v, {d, d});
-#else
v.x = (v.x - 16.0f) * d;
v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_F16
}
-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q5_1 * x = (const block_q5_1 *) vx;
- const dfloat d = __low2half(x[ib].dm);
- const dfloat m = __high2half(x[ib].dm);
+ const float2 dm = __half22float2(x[ib].dm);
uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))      ) & 0x10;
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
-#ifdef GGML_CUDA_F16
- v = __hmul2(v, {d, d});
- v = __hadd2(v, {m, m});
-#else
- v.x = (v.x * d) + m;
- v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
+ v.x = (v.x * dm.x) + dm.y;
+ v.y = (v.y * dm.x) + dm.y;
}
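// Sketch of the affine step shared by dequantize_q4_1 and dequantize_q5_1
// above: the block stores a packed half2 dm = {d, m}; one __half22float2
// conversion yields the scale in .x and the minimum in .y, and each value is
// q*d + m. Helper name is illustrative only.
static __device__ __forceinline__ float dequantize_affine_scalar_sketch(const float q, const float2 dmf) {
    return q*dmf.x + dmf.y; // dmf.x == d (scale), dmf.y == m (min)
}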
-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q8_0 * x = (const block_q8_0 *) vx;
- const dfloat d = x[ib].d;
+ const float d = x[ib].d;
v.x = x[ib].qs[iqs + 0];
v.y = x[ib].qs[iqs + 1];
-#ifdef GGML_CUDA_F16
- v = __hmul2(v, {d, d});
-#else
v.x *= d;
v.y *= d;
-#endif // GGML_CUDA_F16
}
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
- dfloat2 v;
+ float2 v;
dequantize_kernel(src0_row, ib, iqs, v);
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
features.push_back({ "NO_PEER_COPY", "1" });
#endif
- #ifdef GGML_CUDA_F16
- features.push_back({ "F16", "1" });
- #endif
-
#ifdef GGML_CUDA_USE_GRAPHS
features.push_back({ "USE_GRAPHS", "1" });
#endif
sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
}
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
const float2 tmp = __half22float2(__hmul2(dm4, ds8));
const float d4d8 = tmp.x;
const float m4s8 = tmp.y;
#else
    const float2 dm4f = __half22float2(dm4);
    const float2 ds8f = __half22float2(ds8);
const float d4d8 = dm4f.x * ds8f.x;
const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
}
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
const float2 tmp = __half22float2(__hmul2(dm5, ds8));
const float d5d8 = tmp.x;
const float m5s8 = tmp.y;
#else
    const float2 dm5f = __half22float2(dm5);
    const float2 ds8f = __half22float2(ds8);
const float d5d8 = dm5f.x * ds8f.x;
const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
}
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
const float2 tmp = __half22float2(__hmul2(dm8, ds8));
const float d8d8 = tmp.x;
const float m8s8 = tmp.y;
#else
    const float2 dm8f = __half22float2(dm8);
    const float2 ds8f = __half22float2(ds8);
const float d8d8 = dm8f.x * ds8f.x;
const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
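// Sketch of the pattern used by the three #ifdef blocks above (helper name is
// illustrative; the half2 intrinsics are the standard cuda_fp16.h ones already
// used here): when fast FP16 math is available the packed {d, m} and {d8, s8}
// pairs are multiplied with a single __hmul2 and converted once, otherwise
// both pairs are converted to float2 and multiplied componentwise.
static __device__ __forceinline__ float2 mul_dm_ds_sketch(const half2 dm, const half2 ds) {
#ifdef FAST_FP16_AVAILABLE
    return __half22float2(__hmul2(dm, ds));       // {d*d8, m*s8} via half2 math
#else
    const float2 dmf = __half22float2(dm);
    const float2 dsf = __half22float2(ds);
    return make_float2(dmf.x*dsf.x, dmf.y*dsf.y); // same result computed in FP32
#endif // FAST_FP16_AVAILABLE
}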
add_compile_definitions(GGML_CUDA_NO_FA)
endif()
- if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
- add_compile_definitions(GGML_CUDA_F16)
- endif()
-
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()