#include "ggml-cuda.h"
#include "ggml.h"
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
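+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
+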
#define QK4_0 32
#define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
typedef struct {
half d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
#define QK4_1 32
#define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
typedef struct {
half d; // delta
half m; // min
#define QK5_0 32
#define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
typedef struct {
half d; // delta
uint8_t qh[4]; // 5-th bit of quants
#define QK5_1 32
#define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
typedef struct {
half d; // delta
half m; // min
#define QK8_0 32
#define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
typedef struct {
half d; // delta
int8_t qs[QK8_0]; // quants
#define QK8_1 32
#define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
typedef struct {
half d; // delta
half s; // unquantized sum
#define K_SCALE_SIZE 12
#endif
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
} block_q3_K;
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
#ifdef GGML_QKK_64
typedef struct {
half d[2]; // super-block scales/mins
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
#endif
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
#ifdef GGML_QKK_64
typedef struct {
half d; // super-block scale
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
- if (i >= k) {
+ if (i >= kx) {
return;
}
- dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky]; // y is broadcast across x: its elements repeat every ky values
}
static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
y[iybs + iqs + y_offset] = v.y;
}
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
int vi;
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
int qs;
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
int vi;
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq2_K->d;
+ const float dmin = bq2_K->dmin;
+
+ const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
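+    // each thread owns one 32-bit word of qs (16 2-bit quants); the 4 quants
+    // packed into a byte belong to 4 different 32-value sub-blocks, so each
+    // 2-bit shift below pairs with a different q8_1 block and a different scale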
+ for (int i = 0; i < QR2_K; ++i) {
+ const int sc = bq2_K->scales[scale_offset + 2*i];
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const float d8i = bq8i->d;
+
+ const int vi = (v >> (2*i)) & 0x03030303;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply the constant part of q2_K by the sum of the q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+#else
+ return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+ float sumf = 0.0f;
+
+ const float d = bq3_K->d;
+
+ int vl;
+ memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+ int vh;
+ memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+ vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+ vh >>= bq8_offset;
+
+ for (int i = 0; i < QR3_K; ++i) {
+ const int isc = scale_offset + 2*i;
+
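+        // reassemble the 6-bit scale: 4 low bits from scales[0..7], 2 high bits
+        // from scales[8..11], then re-center by subtracting 32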
+ const int isc_low = isc % (QK_K/32);
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
+ const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+ const int isc_high = isc % (QK_K/64);
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
+ const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+ const int sc = (sc_low | sc_high) - 32;
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (2*i)) & 0x03030303;
+
+ const int vih = ((vh >> i) << 2) & 0x04040404;
+
+ const int vi = __vsubss4(vil, vih);
+
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+#else
+ return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+ const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq4_K->d;
+ const float dmin = bq4_K->dmin;
+
+ const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
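+    // the low and high nibbles of each qs byte belong to consecutive 32-value
+    // sub-blocks, so nibble i is paired with q8_1 block bq8_offset + i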
+ for (int i = 0; i < QR4_K; ++i) {
+ const int isc = bq8_offset + i;
+
+ uint8_t sc, m;
+ get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply the constant part of q4_K by the sum of the q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+#else
+ return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+ const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq5_K->d;
+ const float dmin = bq5_K->dmin;
+
+ const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+ const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
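+    // qh packs the 5th bit of every quant: bit bq8_offset + i of each qh byte
+    // belongs to sub-block bq8_offset + i, hence the shift by i and the 0x10 mask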
+ for (int i = 0; i < QR5_K; ++i) {
+ const int isc = bq8_offset + i;
+
+ uint8_t sc, m;
+ get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih = ((vh >> i) << 4) & 0x10101010;
+
+ const int vi = vil | vih;
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply the constant part of q5_K by the sum of the q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+#else
+ return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+ float sumf = 0.0f;
+
+ const float d = bq6_K->d;
+
+ int vl;
+ memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+ int vh;
+ memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
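+    // ql holds the low 4 bits and qh the upper 2 bits (0x30 mask) of each quant;
+    // q6_K values are stored with an offset of 32 which __vsubss4 removes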
+ for (int i = 0; i < QR6_K; ++i) {
+ const int sc = bq6_K->scales[scale_offset + 4*i];
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+#else
+ return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
- const int iby = i + threadIdx.x / qi; // y block index
+ const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
dst[i] = scale * x[i];
}
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+ const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
}
static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
}
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK4_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
}
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
}
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
}
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
}
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
- // TODO: support broadcasting
- GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low;
- // const int64_t ne10 = src1->ne[0];
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
// compute
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
} else {
GGML_ASSERT(dst_ddf_i != nullptr);
const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
- for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
- const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
- float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
- float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
- // compute
- mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
- }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); // src1 is broadcast across src0 inside the kernel
(void) dst;
(void) src0_ddq_i;
int id;
CUDA_CHECK(cudaGetDevice(&id));
- const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+ bool mul_mat_vec_q_implemented =
+ src0->type == GGML_TYPE_Q4_0 ||
src0->type == GGML_TYPE_Q4_1 ||
src0->type == GGML_TYPE_Q5_0 ||
src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0;
-
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
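+    // mul_mat_vec_q kernels for k-quants are only implemented for the standard
+    // super-block size of 256 values (i.e. not for GGML_QKK_64)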
+#if QK_K == 256
+ mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+ src0->type == GGML_TYPE_Q2_K ||
+ src0->type == GGML_TYPE_Q3_K ||
+ src0->type == GGML_TYPE_Q4_K ||
+ src0->type == GGML_TYPE_Q5_K ||
+ src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
#endif
if (use_mul_mat_vec_q) {
case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
+ case GGML_TYPE_Q2_K:
+ mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q3_K:
+ mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_K:
+ mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_K:
+ mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q6_K:
+ mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
default:
GGML_ASSERT(false);
break;
delete extra;
}
+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
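+// hands out extras from a fixed pool in round-robin order; a slot is reused
+// after GGML_MAX_NODES allocations, so an extra only stays valid for tensors
+// created within a single graph evaluation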
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+}
+
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
if (scratch && g_scratch_size == 0) {
return;
}
tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW ||
if (tensor->op == GGML_OP_VIEW) {
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
}
+ extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src0_ddc + offset;
} else if (tensor->op == GGML_OP_CPY) {
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src1_ddv;
} else if (scratch) {
GGML_ASSERT(size <= g_scratch_size);
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
g_scratch_buffer = data;
}
+ extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = data + g_scratch_offset;
g_scratch_offset += size;
void * data;
CUDA_CHECK(cudaMalloc(&data, size));
CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
extra->data_device[g_main_device] = data;
}
#include <unistd.h>
#endif
+// static_assert should come from <assert.h> as a #define; if it does not,
+// fall back to the C11 _Static_assert keyword when the compiler supports it.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
+#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
#endif
#endif
-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
int n_past,
int n_dims,
int mode,
+ float freq_base,
+ float freq_scale,
int n_ctx,
bool inplace) {
GGML_ASSERT(n_past >= 0);
ggml_scratch_save(ctx);
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;
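+    // freq_base and freq_scale are floats stored bit-for-bit in the int32 buffer;
+    // they are read back with memcpy in the compute functions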
+ memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
+ memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
ggml_scratch_load(ctx);
int n_dims,
int mode,
int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
}
struct ggml_tensor * ggml_rope_inplace(
int n_dims,
int mode,
int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ float freq_base,
+ float freq_scale,
+ int n_ctx) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
}
// ggml_rope_back
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_nelements(src1) == 4);
+ GGML_ASSERT(ggml_nelements(src1) == 6);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
+ float freq_base;
+ float freq_scale;
+
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
assert(n_past >= 0);
// row index used to determine which thread to use
int ir = 0;
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
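+    // rotation angle of dimension pair i at position p:
+    //   theta_i = freq_scale * p * freq_base^(-2*i/n_dims)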
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
if (ir++ < ir0) continue;
if (ir > ir1) break;
- float theta = (float)p;
+ float theta = freq_scale * (float)p;
if (is_glm) {
theta = MIN(p, n_ctx - 2);
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_nelements(src1) == 4);
+ GGML_ASSERT(ggml_nelements(src1) == 6);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
+ float freq_base;
+ float freq_scale;
+
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
assert(n_past >= 0);
// row index used to determine which thread to use
int ir = 0;
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
if (ir++ < ir0) continue;
if (ir > ir1) break;
- float theta = (float)p;
+ float theta = freq_scale * (float)p;
if (is_glm) {
theta = MIN(p, n_ctx - 2);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
}
// necessary for llama
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 4);
+ assert(ggml_nelements(src1) == 6);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
{
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 4);
+ assert(ggml_nelements(src1) == 3);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];