#include "common.cuh"
+#include <cstdint>
-static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-    return (x16[0] << 0) | (x16[1] << 16);
-}
-
-static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
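+// get_int_b2/get_int_b4 load one 32-bit word of quantized data; the suffix names the
+// alignment that may be assumed (2 vs. 4 bytes). b2 assembles the word from two 16-bit
+// loads because a direct int load could be misaligned.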
+static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
+ const uint16_t * x16 = (const uint16_t *) x;
+
+ int x32 = x16[2*i32 + 0] << 0;
+ x32 |= x16[2*i32 + 1] << 16;
+
+ return x32;
+}
+
+static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
+ return ((const int *) x)[i32]; // assume at least 4 byte alignment
+}
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
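+// e.g. with vdr == 2 each thread handles 2 consecutive 32-bit words of quantized data per call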
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
const int * v, const int * u, const float & d4, const half2 & ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
// SIMD dot product of quantized values
- sumi = __dp4a(vi0, u[2*i+0], sumi);
- sumi = __dp4a(vi1, u[2*i+1], sumi);
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
}
const float2 ds8f = __half22float2(ds8);
// second part effectively subtracts 8 from each quant value
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q4_1_Q8_1_MMVQ 2
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
// SIMD dot product of quantized values
- sumi = __dp4a(vi0, u[2*i+0], sumi);
- sumi = __dp4a(vi1, u[2*i+1], sumi);
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
}
#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
    const float d4d8 = tmp.x;
    const float m4s8 = tmp.y;
#else
    const float2 dm4f = __half22float2(dm4);
    const float2 ds8f = __half22float2(ds8);
    const float d4d8 = dm4f.x*ds8f.x;
    const float m4s8 = dm4f.y*ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q5_0_Q8_1_MMVQ 2
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
- sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
- sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
}
const float2 ds8f = __half22float2(ds8);
// second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q5_1_Q8_1_MMVQ 2
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
- sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
- sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
}
#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
    const float d5d8 = tmp.x;
    const float m5s8 = tmp.y;
#else
    const float2 dm5f = __half22float2(dm5);
    const float2 ds8f = __half22float2(ds8);
    const float d5d8 = dm5f.x*ds8f.x;
    const float m5s8 = dm5f.y*ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q8_0_Q8_1_MMVQ 2
template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
const int * v, const int * u, const T & d8_0, const T & d8_1) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
for (int i = 0; i < vdr; ++i) {
// SIMD dot product of quantized values
- sumi = __dp4a(v[i], u[i], sumi);
+ sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
}
return d8_0*d8_1 * ((T) sumi);
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
for (int i = 0; i < vdr; ++i) {
// SIMD dot product of quantized values
- sumi = __dp4a(v[i], u[i], sumi);
+ sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
}
#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
    const float d8d8 = tmp.x;
    const float m8s8 = tmp.y;
#else
    const float2 dm8f = __half22float2(dm8);
    const float2 ds8f = __half22float2(ds8);
    const float d8d8 = dm8f.x*ds8f.x;
    const float m8s8 = dm8f.y*ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1 / vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q2_K_Q8_1_MMVQ 1

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
const half2 & dm2, const float * __restrict__ d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];

        const int vi = (v >> (2*i)) & 0x03030303;
- sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+ sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
// fill int with 4x m
int m = sc >> 4;
m |= m << 8;
m |= m << 16;
- sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+ sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
}
const float2 dm2f = __half22float2(dm2);
return dm2f.x*sumf_d - dm2f.y*sumf_m;
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
float sumf_m = 0.0f;
#pragma unroll
    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
        const float2 dm2f = __half22float2(dm2[i0/(QI8_1/2)]);
        int sumi_d = 0;
        int sumi_m = 0;

        const int vi0 = v[i0/(QI8_1/2)];

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
- sumi_d = __dp4a(vi, u[i], sumi_d); // SIMD dot product
- sumi_m = __dp4a(0x01010101, u[i], sumi_m);
+ sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product
+ sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m);
}
        sumf_d += dm2f.x * sumi_d;
        sumf_m += dm2f.y * sumi_m;
    }
return d8*(sumf_d - sumf_m);
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q3_K_Q8_1_MMVQ 1

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
const int & scale_offset, const float & d3, const float * __restrict__ d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf = 0.0f;
#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0x0F;

        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 0x03) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;
        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi = __vsubss4(vil, vih);
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+ sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
}
return d3 * sumf;
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
const float & d3, const float & d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi = 0;
#pragma unroll
    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
        int sumi_sc = 0;

#pragma unroll
for (int i = i0; i < i0 + QI8_1/2; ++i) {
const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
- sumi_sc = __dp4a(vi, u[i], sumi_sc); // SIMD dot product
+ sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product
}
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
}
return d3*d8 * sumi;
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q4_K_Q8_1_MMVQ 2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
- const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
- const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+ const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+ const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u
sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
    }

    const float2 dm4f = __half22float2(dm4);
return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
- sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
}
        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y *  m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);
return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q5_K_Q8_1_MMVQ 2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
const int v1i = vl1i | vh1i;
- const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
- const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+ const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+ const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u
sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);
    }

    const float2 dm5f = __half22float2(dm5);
return dm5f.x*sumf_d - dm5f.y*sumf_m;
-
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
float sumf_m = 0.0f;
#pragma unroll
    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
- sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
}
        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y *  m[i]; // sum of q8_1 block * q5_K min val
    }

    const float2 dm4f = __half22float2(dm4);
return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q6_K_Q8_1_MMVQ 1

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
const float & d, const float * __restrict__ d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf = 0.0f;
#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+ sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
}
return d*sumf;
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
const float & d6, const float * __restrict__ d8) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float sumf_d = 0.0f;
#pragma unroll
    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale

#pragma unroll
for (int i = i0; i < i0 + 2; ++i) {
- sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
- sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+ sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+ sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
- sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
- sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+ sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+ sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
}
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
}
return d6 * sumf_d;
-
-#else
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq + kbx;

    int v[VDR_Q4_0_Q8_1_MMVQ];
    int u[2*VDR_Q4_0_Q8_1_MMVQ];

#pragma unroll
for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
- v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+ v[i] = get_int_b2(bq4_0->qs, iqs + i);
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_0);
}
    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq + kbx;

    int v[VDR_Q4_1_Q8_1_MMVQ];
    int u[2*VDR_Q4_1_Q8_1_MMVQ];

#pragma unroll
for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
- v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+ v[i] = get_int_b4(bq4_1->qs, iqs + i);
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_1);
}
    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq + kbx;

    int vl[VDR_Q5_0_Q8_1_MMVQ];
    int vh[VDR_Q5_0_Q8_1_MMVQ];
    int u[2*VDR_Q5_0_Q8_1_MMVQ];

#pragma unroll
for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
- vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
- vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+ vl[i] = get_int_b2(bq5_0->qs, iqs + i);
+ vh[i] = get_int_b2(bq5_0->qh, 0) >> (4 * (iqs + i));
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_0);
}
    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq + kbx;

    int vl[VDR_Q5_1_Q8_1_MMVQ];
    int vh[VDR_Q5_1_Q8_1_MMVQ];
    int u[2*VDR_Q5_1_Q8_1_MMVQ];

#pragma unroll
for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
- vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
- vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+ vl[i] = get_int_b4(bq5_1->qs, iqs + i);
+ vh[i] = get_int_b4(bq5_1->qh, 0) >> (4 * (iqs + i));
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_1);
}
    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq + kbx;

    int v[VDR_Q8_0_Q8_1_MMVQ];
    int u[VDR_Q8_0_Q8_1_MMVQ];

#pragma unroll
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
- v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
- u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+ v[i] = get_int_b2(bq8_0->qs, iqs + i);
+ u[i] = get_int_b4(bq8_1->qs, iqs + i);
}
    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q2_K * bq2_K = (const block_q2_K *) vbq + kbx;

    const int bq8_offset = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const uint8_t * scales = bq2_K->scales + scale_offset;
- const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+ const int v = get_int_b4(bq2_K->qs, iqs);
int u[QR2_K];
float d8[QR2_K];
#pragma unroll
for (int i = 0; i < QR2_K; ++ i) {
- u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+ u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q3_K * bq3_K = (const block_q3_K *) vbq + kbx;

    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;
- const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+ const int vl = get_int_b2(bq3_K->qs, iqs);
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
- const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+ const int vh = ~get_int_b2(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
int u[QR3_K];
float d8[QR3_K];
#pragma unroll
for (int i = 0; i < QR3_K; ++i) {
- u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+ u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, scales, scale_offset, d, d8);
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq + kbx;

    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
- const int vl = get_int_from_uint8(bq6_K->ql, iqs);
- const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+ const int vl = get_int_b2(bq6_K->ql, iqs);
+ const int vh = get_int_b2(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
    const int8_t * scales = bq6_K->scales + scale_offset;

    int u[QR6_K];
    float d8[QR6_K];

#pragma unroll
for (int i = 0; i < QR6_K; ++i) {
- u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+ u[i] = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
}
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}
+#define VDR_IQ2_XXS_Q8_1_MMVQ 2
+
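+// iq2_xxs: each int of qs holds 4 grid indices (one per byte); the following int packs
+// 4x 7-bit sign mask indices in its low 28 bits and the 4-bit block scale in its top 4 bits.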
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;
-#if QR2_XXS == 8
- const int ib32 = iqs;
- const uint16_t * q2 = bq2->qs + 4*ib32;
- const uint8_t * aux8 = (const uint8_t *)q2;
- const int8_t * q8 = bq8_1[ib32].qs;
- uint32_t aux32 = q2[2] | (q2[3] << 16);
+ const int q2 = get_int_b2(bq2->qs, iqs);
+ const uint8_t * aux8 = (const uint8_t *) &q2;
+ const uint32_t aux32 = get_int_b2(bq2->qs, iqs + 1);
+
int sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
- const uint8_t signs = ksigns_iq2xs[aux32 & 127];
- for (int j = 0; j < 8; ++j) {
- sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
- }
- q8 += 8;
- aux32 >>= 7;
+#pragma unroll
+ for (int k0 = 0; k0 < 8; k0 += 2) {
+ const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
+ const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
+
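+        // spread the 8 sign bits into per-byte 0x00/0xFF masks; grid ^ mask - mask then
+        // negates exactly those bytes whose sign bit is set (two's complement identity)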
+ const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
+ const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
+ sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+
+ const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
+ const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
+ sumi = ggml_cuda_dp4a(grid1, u1, sumi);
}
- const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
+
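+    // fixed-point scaling: (ls + 1/2) * sumi / 4, kept entirely in integer arithmetic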
+ const int ls = aux32 >> 28;
+ sumi = (ls*sumi + sumi/2)/4;
+ const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
return d * sumi;
-#else
- // iqs is 0...15
- const int ib32 = iqs/2;
- const int il = iqs%2;
- const uint16_t * q2 = bq2->qs + 4*ib32;
- const uint8_t * aux8 = (const uint8_t *)q2;
- const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
- const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
- const uint32_t aux32 = q2[2] | (q2[3] << 16);
- const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
- const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
- const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
- const int8_t * q8 = bq8_1[ib32].qs + 16*il;
- int sumi1 = 0, sumi2 = 0;
- for (int j = 0; j < 8; ++j) {
- sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
- sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
- }
- return d * (sumi1 + sumi2);
-#endif
}
+#define VDR_IQ2_XS_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;
- const int ib32 = iqs;
- const uint16_t * q2 = bq2->qs + 4*ib32;
- const int8_t * q8 = bq8_1[ib32].qs;
- const uint8_t ls1 = bq2->scales[ib32] & 0xf;
- const uint8_t ls2 = bq2->scales[ib32] >> 4;
+ const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));
+ const uint16_t * q2 = (const uint16_t *) &q2_packed;
+ const int ls0 = bq2->scales[iqs/2] & 0x0F;
+ const int ls1 = bq2->scales[iqs/2] >> 4;
+
+ int sumi0 = 0;
int sumi1 = 0;
- for (int l = 0; l < 2; ++l) {
- const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
- const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
- const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
- sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
- sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
- q8 += 8;
- }
- int sumi2 = 0;
- for (int l = 2; l < 4; ++l) {
- const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
- const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
- const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
- sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
- sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
- q8 += 8;
+#pragma unroll
+ for (int l0 = 0; l0 < 8; l0 += 2) {
+ const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
+
+ const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
+ const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+ if (l0 < 4) {
+ sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+ sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
+ } else {
+ sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+ sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
+ }
}
- const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
- return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
- GGML_UNUSED(ksigns64);
- NO_DEVICE_CODE;
-#endif
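+    // fixed-point combine of the two 4-bit half-block scales: ((ls0 + 1/2)*sumi0 + (ls1 + 1/2)*sumi1) / 4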
+ const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
+ const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+ return d * sumi;
}
-// TODO
+#define VDR_IQ2_S_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+
const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;
- const int ib32 = iqs;
- const int8_t * q8 = bq8_1[ib32].qs;
- const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
- const uint8_t ls1 = bq2->scales[ib32] & 0xf;
- const uint8_t ls2 = bq2->scales[ib32] >> 4;
+ const int qs_packed = get_int_b2(bq2->qs, iqs/2);
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+ const int qh = bq2->qh[iqs/2];
+
+ const int signs_packed_32 = get_int_b2(bq2->qs, QK_K/32 + iqs/2);
+ const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32;
+
+ const int ls0 = bq2->scales[iqs/2] & 0x0F;
+ const int ls1 = bq2->scales[iqs/2] >> 4;
+
+ int sumi0 = 0;
int sumi1 = 0;
- for (int l = 0; l < 2; ++l) {
- const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
- const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
- const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
- const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
- const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
- sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
- sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
- q8 += 8;
- }
- int sumi2 = 0;
- for (int l = 2; l < 4; ++l) {
- const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
- const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
- const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
- const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
- const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
- sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
- sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
- q8 += 8;
+#pragma unroll
+ for (int l0 = 0; l0 < 8; l0 += 2) {
+ const int * grid_pos = (const int *)(iq2s_grid + (qs[l0/2] | ((qh << (8-l0)) & 0x300)));
+
+ const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
+ const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
+
+ const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
+ const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+ if (l0 < 4) {
+ sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+ sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
+ } else {
+ sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+ sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
+ }
}
- const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
- return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
- GGML_UNUSED(ksigns64);
- NO_DEVICE_CODE;
-#endif
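+    // as for iq2_xs: ((ls0 + 1/2)*sumi0 + (ls1 + 1/2)*sumi1) / 4 in integer arithmetic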
+ const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
+
+ const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+ return d * sumi;
}
+#define VDR_IQ3_XXS_Q8_1_MMVQ 2
+
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq + kbx;
-
- const int ib32 = iqs;
- const uint8_t * q3 = bq2->qs + 8*ib32;
- const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
- const int8_t * q8 = bq8_1[ib32].qs;
- uint32_t aux32 = gas[0] | (gas[1] << 16);
+
+ const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;
+
+ const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));
+ const uint8_t * q3 = (const uint8_t *) &q3_packed;
+ const uint32_t aux32 = get_int_b2(bq3->qs, QK_K/16 + iqs/2);
+
int sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
- const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
- const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
- const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
- sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
- sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
- q8 += 8;
- aux32 >>= 7;
+#pragma unroll
+ for (int l0 = 0; l0 < 8; l0 += 2) {
+ const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
+
+ const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
+
+ const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
+ const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+ sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+ sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
}
- const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
+
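+    // fixed-point scaling: (ls + 1/2) * sumi / 2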
+ const int ls = aux32 >> 28;
+ sumi = (ls*sumi + sumi/2)/2;
+ const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
return d * sumi;
-#else
- NO_DEVICE_CODE;
-#endif
}
+#define VDR_IQ3_S_Q8_1_MMVQ 2
+
// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const block_iq3_s * bq2 = (const block_iq3_s *) vbq + kbx;
- const int ib32 = iqs;
- const uint8_t * qs = bq2->qs + 8*ib32;
- const int8_t * q8 = bq8_1[ib32].qs;
+ const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;
+
+ const int2 qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+ const int qh = bq3->qh[iqs/2];
+
+ const int signs_packed_32 = get_int_b2(bq3->signs, iqs/2);
+ const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32;
+
int sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
- const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
- uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
- uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
- const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
- const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
- sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
- sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
- q8 += 8;
+#pragma unroll
+ for (int l0 = 0; l0 < 8; l0 += 2) {
+ const int2 grid_pos = make_int2(
+ iq3s_grid[qs[l0 + 0] | ((qh << (8 - l0)) & 0x100)],
+ iq3s_grid[qs[l0 + 1] | ((qh << (7 - l0)) & 0x100)]);
+
+ const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
+ const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
+
+ const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
+ const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
+
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+ sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+ sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
}
- const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
+
+ sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);
+
+ const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
return d * sumi;
-#else
- NO_DEVICE_CODE;
-#endif
}
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
- const int ib32 = iqs;
+ const int qs_packed = get_int_b2(bq1->qs, iqs);
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+ const int qh = bq1->qh[iqs];
+
int sumi = 0;
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const int * q8 = (const int *)bq8_1[ib32].qs;
- for (int l = 0; l < 4; ++l) {
- const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
- int grid0 = grid[0] & 0x0f0f0f0f;
- int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
- sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
- }
-#else
- const int8_t * q8 = bq8_1[ib32].qs;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
- for (int j = 0; j < 4; ++j) {
- sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
- }
- q8 += 8;
+#pragma unroll
+ for (int l0 = 0; l0 < 8; l0 += 2) {
+ const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
+
+ const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+ const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+ const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+ const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+ sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+ sumi = ggml_cuda_dp4a(grid1, u1, sumi);
}
-#endif
- const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
- const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
- const float d = d1q * __low2float (bq8_1[ib32].ds);
- const float m = d1q * __high2float(bq8_1[ib32].ds);
- return d * sumi + m * delta;
+
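+    // branchless form of: qh & 0x8000 ? -1.0f - IQ1S_DELTA : -1.0f + IQ1S_DELTA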
+ const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
+ const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+ const float2 ds = __half22float2(bq8_1[iqs].ds);
+ return d1q * (ds.x*sumi + ds.y*delta);
}
static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;
- const int ib32 = iqs;
- int sumi[2] = {0, 0};
- float sumf[2] = {0.f, 0.f};
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const int * q8 = (const int *)bq8_1[ib32].qs;
- for (int l = 0; l < 4; ++l) {
- const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
- int grid0 = grid[0] & 0x0f0f0f0f;
- int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
- sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2]));
- const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
- const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0));
- sumf[l/2] += delta*sumy;
- }
-#else
- const int8_t * q8 = bq8_1[ib32].qs;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+ const int qs_packed = get_int_b4(bq1->qs, iqs);
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
+
+ int sumi[2] = {0};
+ float sumf[2] = {0.0f};
+#pragma unroll
+ for (int l0 = 0; l0 < 8; l0 += 2) {
+ const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
+
+ const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
+
+ const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+ const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+ const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+ const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+ sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]);
+ sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]);
+
+ const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
int sumy = 0;
- for (int j = 0; j < 4; ++j) {
- sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
- sumy += q8[j] + q8[j+4];
- }
- const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
- sumf[l/2] += delta*sumy;
- q8 += 8;
+ sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy);
+ sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy);
+ sumf[l0/4] += delta*sumy;
}
-#endif
+
+ const uint16_t * sc = (const uint16_t *) bq1->scales;
+
iq1m_scale_t scale;
- const uint16_t * sc = (const uint16_t *)bq1->scales;
- scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
- const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
- return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
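+    // the per-block fp16 scale is scattered across the top nibbles of sc[0..3]; gather it back: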
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
+ const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
+
+ const int tmp = sc[iqs/2] >> (6*(iqs%2));
+ const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
+ const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
+ return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
}
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
- int & val1, int & val2) {
-
- uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
- aux32 = q4 & 0x0f0f0f0f;
- uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
- uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
- val1 = v1 | (v2 << 16);
- aux32 = (q4 >> 4) & 0x0f0f0f0f;
- v1 = values[q8[0]] | (values[q8[1]] << 8);
- v2 = values[q8[2]] | (values[q8[3]] << 8);
- val2 = v1 | (v2 << 16);
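+// map 8 packed 4-bit indices through the kvalues_iq4nl table and return them as
+// 2 ints holding 4 int8 values each (low nibbles in .x, high nibbles in .y)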
+static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {
+ const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
+ const int8_t * q0_8 = (const int8_t *) &q0_32;
+ const char4 val0_8 = make_char4(
+ kvalues_iq4nl[q0_8[0]], kvalues_iq4nl[q0_8[1]], kvalues_iq4nl[q0_8[2]], kvalues_iq4nl[q0_8[3]]);
+
+ const int q1_32 = (q4 >> 4) & 0x0F0F0F0F;
+ const int8_t * q1_8 = (const int8_t *) &q1_32;
+ const char4 val1_8 = make_char4(
+ kvalues_iq4nl[q1_8[0]], kvalues_iq4nl[q1_8[1]], kvalues_iq4nl[q1_8[2]], kvalues_iq4nl[q1_8[3]]);
+
+ return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
}
-#endif
+
+#define VDR_IQ4_NL_Q8_1_MMVQ 2
static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
- const block_iq4_nl * bq = (const block_iq4_nl *) vbq + kbx;
+ const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
- const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
+ const int * q8 = (const int *) bq8_1->qs + iqs;
- const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
- int v1, v2;
- int sumi1 = 0, sumi2 = 0;
+ int sumi = 0;
+#pragma unroll
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
- const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
- get_int_from_table_16(aux, values, v1, v2);
- sumi1 = __dp4a(v1, q8[l+0], sumi1);
- sumi2 = __dp4a(v2, q8[l+4], sumi2);
- }
-
-#else
- const uint8_t * q4 = bq->qs + 4*iqs;
- const int8_t * q8 = bq8_1->qs + 4*iqs;
+ const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
+ const int2 v = get_int_from_table_16(aux_q4);
- int sumi1 = 0, sumi2 = 0;
- for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
- sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
- sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
+ sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
+ sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
}
-#endif
- const float d = (float)bq->d * __low2float(bq8_1->ds);
- return d * (sumi1 + sumi2);
+
+ const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
+ return d * sumi;
}
+#define VDR_IQ4_XS_Q8_1_MMVQ 4
+
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
- const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
- // iqs is 0...7
- const int ib32 = iqs;
- const int32_t * q8 = (const int *)bq8_1[ib32].qs;
- const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
- const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
- const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
- int v1, v2;
- int sumi1 = 0, sumi2 = 0;
+
+ int sumi = 0;
+#pragma unroll
for (int j = 0; j < 4; ++j) {
- get_int_from_table_16(q4[j], values, v1, v2);
- sumi1 = __dp4a(v1, q8[j+0], sumi1);
- sumi2 = __dp4a(v2, q8[j+4], sumi2);
+ const int aux_q4 = get_int_b4(bq4->qs, iqs + j);
+ const int2 v = get_int_from_table_16(aux_q4);
+
+ const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0);
+ const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4);
+
+ sumi = ggml_cuda_dp4a(v.x, u0, sumi);
+ sumi = ggml_cuda_dp4a(v.y, u1, sumi);
}
- return d * (sumi1 + sumi2);
-#else
- return vec_dot_iq4_xs_q8_1(vbq, bq8_1, kbx, iqs);
-#endif
+
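+    // 6-bit scale: low 4 bits from the scales_l nibble, high 2 bits from scales_h, bias of -32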
+ const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
+ sumi *= ls - 32;
+
+ const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
+ return d * sumi;
}