}
#endif //__loongarch_asx
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
quantize_row_q4_0_ref(x, y, k);
}
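The hunks above and below make a single mechanical change: every use of the C99 `restrict` qualifier is replaced by the `GGML_RESTRICT` macro, presumably so the same pointer qualifiers stay valid when these sources are also built as C++, where `restrict` is not a keyword. As a minimal sketch only, and not ggml's actual definition, such a portability macro can be written roughly as follows; the `scale_row` helper is likewise hypothetical and simply shows the qualifier in use:

#if defined(__cplusplus) || defined(_MSC_VER)
#    define GGML_RESTRICT __restrict   // compiler extension accepted by GCC, Clang and MSVC
#else
#    define GGML_RESTRICT restrict     // standard C99 keyword
#endif

// GGML_RESTRICT promises the compiler that src and dst never alias, allowing the same
// vectorization-friendly code generation as plain `restrict` in C.
static void scale_row(const float * GGML_RESTRICT src, float * GGML_RESTRICT dst, int n, float s) {
    for (int i = 0; i < n; ++i) {
        dst[i] = s * src[i];
    }
}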
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
quantize_row_q4_1_ref(x, y, k);
}
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
quantize_row_q5_0_ref(x, y, k);
}
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
quantize_row_q5_1_ref(x, y, k);
}
-void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;
- block_q8_0 * restrict y = vy;
+ block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
#endif
}
-void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK8_1 == 0);
const int nb = k / QK8_1;
- block_q8_1 * restrict y = vy;
+ block_q8_1 * GGML_RESTRICT y = vy;
#if defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
return (i & 0x007fffff) - 0x00400000;
}
-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
- const float * restrict qw) {
+static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+ const float * GGML_RESTRICT qw) {
float max = 0;
float amax = 0;
for (int i = 0; i < n; ++i) {
return scale;
}
-static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
float max = 0;
float amax = 0;
for (int i = 0; i < n; ++i) {
return 1/iscale;
}
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
int ntry, float alpha) {
float min = x[0];
float max = x[0];
return scale;
}
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
float rmin, float rdelta, int nstep, bool use_mad) {
float min = x[0];
float max = x[0];
return scale;
}
-static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
if (j < 4) {
*d = q[j] & 63; *m = q[j + 4] & 63;
} else {
//========================- 2-bit (de)-quantization
-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
quantize_row_q2_K_ref(x, vy, k);
}
//========================= 3-bit (de)-quantization
-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
quantize_row_q3_K_ref(x, vy, k);
}
// ====================== 4-bit (de)-quantization
-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK_K == 0);
- block_q4_K * restrict y = vy;
+ block_q4_K * GGML_RESTRICT y = vy;
quantize_row_q4_K_ref(x, y, k);
}
// ====================== 5-bit (de)-quantization
-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK_K == 0);
- block_q5_K * restrict y = vy;
+ block_q5_K * GGML_RESTRICT y = vy;
quantize_row_q5_K_ref(x, y, k);
}
// ====================== 6-bit (de)-quantization
-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK_K == 0);
- block_q6_K * restrict y = vy;
+ block_q6_K * GGML_RESTRICT y = vy;
quantize_row_q6_K_ref(x, y, k);
}
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
-void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK_K == 0);
- block_tq1_0 * restrict y = vy;
+ block_tq1_0 * GGML_RESTRICT y = vy;
quantize_row_tq1_0_ref(x, y, k);
}
-void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(k % QK_K == 0);
- block_tq2_0 * restrict y = vy;
+ block_tq2_0 * GGML_RESTRICT y = vy;
quantize_row_tq2_0_ref(x, y, k);
}
//===================================== Q8_K ==============================================
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
#ifdef __wasm_simd128__
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
- block_q8_K * restrict yc = y; // Cast to proper type
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
for (int i = 0; i < nb; i++) {
const float * x_block = x + i * QK_K;
}
#endif
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
UNUSED(by);
UNUSED(bs);
- const block_q4_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q4_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
- const block_q4_0 * restrict vx0 = vx;
- const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
- const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+ const block_q4_0 * GGML_RESTRICT vx0 = vx;
+ const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
float32x4_t sumv0 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i++) {
- const block_q4_0 * restrict b_x0 = &vx0[i];
- const block_q4_0 * restrict b_x1 = &vx1[i];
- const block_q8_0 * restrict b_y0 = &vy0[i];
- const block_q8_0 * restrict b_y1 = &vy1[i];
+ const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t s8b = vdupq_n_s8(0x8);
const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
// load x
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
// load x
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
const svbool_t pl16 = svnot_b_z(ph32, ph16);
for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
// load x
const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t s8b = vdupq_n_s8(0x8);
const v128_t s8b = wasm_i8x16_splat(0x8);
for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
// Load and process x0
v128_t v0_0 = wasm_v128_load(x0->qs);
*s = sumf;
}
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
UNUSED(by);
UNUSED(bs);
- const block_q4_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ const block_q4_1 * GGML_RESTRICT x = vx;
+ const block_q8_1 * GGML_RESTRICT y = vy;
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
- const block_q4_1 * restrict vx0 = vx;
- const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
- const block_q8_1 * restrict vy0 = vy;
- const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+ const block_q4_1 * GGML_RESTRICT vx0 = vx;
+ const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+ const block_q8_1 * GGML_RESTRICT vy0 = vy;
+ const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t summs0 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i++) {
- const block_q4_1 * restrict b_x0 = &vx0[i];
- const block_q4_1 * restrict b_x1 = &vx1[i];
- const block_q8_1 * restrict b_y0 = &vy0[i];
- const block_q8_1 * restrict b_y1 = &vy1[i];
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
float32_t summs_t[4] = {
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
float summs = 0;
for (; ib + 1 < nb; ib += 2) {
- const block_q4_1 * restrict x0 = &x[ib + 0];
- const block_q4_1 * restrict x1 = &x[ib + 1];
- const block_q8_1 * restrict y0 = &y[ib + 0];
- const block_q8_1 * restrict y1 = &y[ib + 1];
+ const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
*s = sumf;
}
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
UNUSED(by);
UNUSED(bs);
- const block_q5_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q5_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
uint64_t tmp1[4];
for (; ib + 1 < nb; ib += 2) {
- const block_q5_0 * restrict x0 = &x[ib];
- const block_q5_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
// TODO: check if unrolling this is better
for (; ib < nb; ++ib) {
- const block_q5_0 * restrict x0 = &x[ib];
- const block_q8_0 * restrict y0 = &y[ib];
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
const v128_t m4b = wasm_i8x16_splat(0x0F);
*s = sumf;
}
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
UNUSED(by);
UNUSED(bs);
- const block_q5_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ const block_q5_1 * GGML_RESTRICT x = vx;
+ const block_q8_1 * GGML_RESTRICT y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
uint64_t tmp1[4];
for (; ib + 1 < nb; ib += 2) {
- const block_q5_1 * restrict x0 = &x[ib];
- const block_q5_1 * restrict x1 = &x[ib + 1];
- const block_q8_1 * restrict y0 = &y[ib];
- const block_q8_1 * restrict y1 = &y[ib + 1];
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
// TODO: check if unrolling this is better
for (; ib < nb; ++ib) {
- const block_q5_1 * restrict x0 = &x[ib];
- const block_q8_1 * restrict y0 = &y[ib];
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
*s = sumf;
}
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
UNUSED(by);
UNUSED(bs);
- const block_q8_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q8_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
- const block_q8_0 * restrict vx0 = vx;
- const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
- const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+ const block_q8_0 * GGML_RESTRICT vx0 = vx;
+ const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
float32x4_t sumv0 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i++) {
- const block_q8_0 * restrict b_x0 = &vx0[i];
- const block_q8_0 * restrict b_y0 = &vy0[i];
+ const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
- const block_q8_0 * restrict b_x1 = &vx1[i];
- const block_q8_0 * restrict b_y1 = &vy1[i];
+ const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
const int8x16_t x0_l = vld1q_s8(b_x0->qs);
const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
// load x
const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
{
//printf("sve256");
for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
// load x
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
svfloat32_t sumv00 = svdup_n_f32(0.0f);
for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
//load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
// and add them to make one 64 element vector
float32x4_t sumv1 = vdupq_n_f32(0.0f);
for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
const int8x16_t x0_0 = vld1q_s8(x0->qs);
const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
v128_t sumv = wasm_f32x4_splat(0.0f);
for (; ib < nb; ++ib) {
- const block_q8_0 * restrict x0 = &x[ib];
- const block_q8_0 * restrict y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
const v128_t x0_0 = wasm_v128_load(x0->qs);
const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
*s = sumf;
}
-void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
- const block_tq1_0 * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_tq1_0 * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
#endif
}
-void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
- const block_tq2_0 * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_tq2_0 * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
#endif
}
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
- const block_q2_K * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_q2_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8_sv = y[i].qs;
- const uint8_t * restrict sc = x[i].scales;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8_sv = y[i].qs;
- const uint8_t * restrict sc = x[i].scales;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
- const uint8_t * restrict sc = x[i].scales;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
const uint8x16_t mins_and_scales = vld1q_u8(sc);
const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// load mins and scales from block_q2_K.scales[QK_K/16]
const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
vector signed int vsumi6 = v0;
vector signed int vsumi7 = v0;
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/128; ++j) {
__builtin_prefetch(q2, 0, 1);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
#endif
}
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
- const block_q3_K * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_q3_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q3_sv = x[i].qs;
- const uint8_t * restrict qh_sv = x[i].hmask;
- const int8_t * restrict q8_sv = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
// Set up scales
memcpy(aux, x[i].scales, 12);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict qh = x[i].hmask;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q3 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Set up scales
memcpy(aux, x[i].scales, 12);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q3 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Set up scales
aux = (const uint32_t *)x[i].scales;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict hm = x[i].hmask;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Process blocks with SIMD
int8_t * a = aux8;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict qh = x[i].hmask;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(aux, x[i].scales, 12);
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
vector signed int vsumi6 = v0;
vector signed int vsumi7 = v0;
- const uint8_t * restrict q3 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/128; ++j) {
__builtin_prefetch(q3, 0, 1);
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q3 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Set up scales
memcpy(aux, x[i].scales, 12);
__m128i scales128 = lsx_set_w(
float sumf = 0;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict hm = x[i].hmask;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * restrict a = aux8;
+ int8_t * GGML_RESTRICT a = aux8;
uint8_t m = 1;
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
}
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_q4_K * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_q4_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint8_t * scales = (const uint8_t *)utmp;
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const int vector_length = ggml_cpu_get_sve_cnt()*8;
const svuint8_t m4b = svdup_n_u8(0xf);
const uint8_t * scales = (const uint8_t *)utmp;
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t sumi1 = 0;
int32_t sumi2 = 0;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Process scales and mins
memcpy(utmp, x[i].scales, 12);
// Sum mins * q8sums
int32_t sumi = 0;
- const int16_t * restrict q8sums = y[i].bsums;
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
const uint8_t * m = (const uint8_t *)&utmp[2];
for (int j = 0; j < 16; j += 2) {
sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
utmp[2] = uaux;
utmp[0] &= kmask1;
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
vl = 32;
vector signed int vsumi2 = v0;
vector signed int vsumi3 = v0;
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/64; j+=2) {
__builtin_prefetch(q4, 0, 1);
utmp[2] = uaux;
utmp[0] &= kmask1;
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
const uint8_t * scales = (const uint8_t *)utmp;
- const uint8_t * restrict x0 = x[i].qs;
- const int8_t * restrict y0 = y[i].qs;
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
int32_t sumi1 = 0;
int32_t sumi2 = 0;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * restrict a = aux8;
+ int8_t * GGML_RESTRICT a = aux8;
for (int j = 0; j < QK_K/64; ++j) {
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
a += 32;
#endif
}
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_q5_K * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_q5_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint8_t * scales = (const uint8_t *)utmp;
- const uint8_t * restrict q5 = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
float summs = 0.f;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q5 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
- const uint8_t * restrict q5 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
- const uint8_t * restrict q5 = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Process scales and mins
memcpy(utmp, x[i].scales, 12);
// Sum mins * q8sums
int32_t sumi_mins = 0;
- const int16_t * restrict q8sums = y[i].bsums;
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
const uint8_t * m = (const uint8_t *)&utmp[2];
for (int j = 0; j < 16; j += 2) {
sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
vl = 8;
- const uint8_t * restrict q5 = x[i].qs;
- const uint8_t * restrict hm = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
vector signed int vsumi2 = v0;
vector signed int vsumi3 = v0;
- const uint8_t * restrict q5 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/64; ++j) {
__builtin_prefetch(q5, 0, 1);
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q5 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
const uint8_t * scales = (const uint8_t *)utmp;
- const uint8_t * restrict x0l = x[i].qs;
- const uint8_t * restrict x0h = x[i].qh;
- const int8_t * restrict y0 = y[i].qs;
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
v_xh[0] = vec_xl(0 , x0h);
v_xh[1] = vec_xl(16, x0h);
float sumf = 0;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q4 = x[i].qs;
- const uint8_t * restrict hm = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * restrict a = aux8;
+ int8_t * GGML_RESTRICT a = aux8;
uint8_t m = 1;
for (int j = 0; j < QK_K/64; ++j) {
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
#endif
}
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_q6_K * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_q6_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const float d_all = GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q6 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
- const int8_t * restrict scale = x[i].scales;
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
const int8x16_t scales = vld1q_s8(scale);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q4 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q4 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
// handle the q6_k -32 offset separately using bsums
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
for (int i = 0; i < nb; ++i) {
// Unpack 6-bit quantized data into aux8 (unchanged)
- const uint8_t * restrict q4 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
int8_t * a = aux8;
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) {
qh += 32;
}
- const int8_t * restrict a_ptr = aux8;
- const int8_t * restrict q8 = y[i].qs;
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
v128_t acc0 = wasm_i32x4_splat(0);
v128_t acc1 = wasm_i32x4_splat(0);
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict q6 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
- const int8_t * restrict scale = x[i].scales;
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
size_t vl;
vector signed int vsumi6 = v0;
vector signed int vsumi7 = v0;
- const uint8_t * restrict q6 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict qs = x[i].scales;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT qs = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/128; ++j) {
__builtin_prefetch(q6, 0, 0);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q4 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
for (int i = 0; i < nb; ++i) {
const float d_all = GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict x0l = x[i].ql;
- const uint8_t * restrict x0h = x[i].qh;
- const int8_t * restrict y0 = y[i].qs;
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
- const int8_t * restrict scale = x[i].scales;
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
float sumf = 0;
for (int i = 0; i < nb; ++i) {
- const uint8_t * restrict q4 = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * restrict a = aux8;
+ int8_t * GGML_RESTRICT a = aux8;
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) {
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
};
#endif
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq2_xxs * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
float sumf1 = 0, sumf2 = 0;
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = _mm256_setzero_si256();
__m256i sumi2 = _mm256_setzero_si256();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
vector signed int vsumi2 = v0;
vector signed int vsumi3 = v0;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/32; j += 2) {
__builtin_prefetch(q2, 0, 1);
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = __lasx_xvldi(0);
__m256i sumi2 = __lasx_xvldi(0);
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
//
// for (int i = 0; i < nb; ++i) {
// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-// const uint16_t * restrict q2 = x[i].qs;
-// const int8_t * restrict q8 = y[i].qs;
+// const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+// const int8_t * GGML_RESTRICT q8 = y[i].qs;
//
// float sumf1 = 0, sumf2 = 0;
//
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
memcpy(aux32, q2, 2*sizeof(uint32_t));
#endif
}
-void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq2_xs * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq2_xs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
const uint8x8_t scales8 = vld1_u8(x[i].scales);
const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
__m128i stmp = _mm_set1_epi64x(aux64);
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
__m128i stmp = _mm_set1_epi64x(aux64);
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
__m128i stmp = __lsx_vreplgr2vr_d(aux64);
vector signed int vsumi2 = v0;
vector signed int vsumi3 = v0;
- const uint16_t * restrict q2 = x[i].qs;
- const uint8_t * restrict sc = x[i].scales;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/64; ++j) {
__builtin_prefetch(q2, 0, 1);
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * restrict q2 = x[i].qs;
- const uint8_t * restrict sc = x[i].scales;
- const int8_t * restrict q8 = y[i].qs;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
#endif
}
-void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq2_s * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq2_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
int sumi1 = 0, sumi2 = 0;
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
vector signed int vsumi2 = v0;
vector signed int vsumi3 = v0;
- const uint8_t * restrict q2 = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
- const uint8_t * restrict sc = x[i].scales;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
for (int j = 0; j < QK_K/32; j += 2) {
__builtin_prefetch(q2, 0, 1);
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m128i tmp1;
memcpy(&aux64, x[i].scales, 8);
}
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq3_xxs * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
float sumf1 = 0, sumf2 = 0;
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = _mm256_setzero_si256();
__m256i sumi2 = _mm256_setzero_si256();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
vector signed int vsumi2 = v0;
vector signed int vsumi3 = v0;
- const uint8_t * restrict q3 = x[i].qs;
- const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
#pragma GCC unroll 1
for (int j = 0; j < QK_K/32; j += 2) {
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = __lasx_xvldi(0);
__m256i sumi2 = __lasx_xvldi(0);
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
#endif
}
-void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq3_s * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq3_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
memcpy(scales32, x[i].scales, 4);
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = _mm256_setzero_si256();
__m256i sumi2 = _mm256_setzero_si256();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
- const uint8_t * restrict q3 = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
- const uint8_t * restrict sc = x[i].scales;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
vector signed int vsumi0 = v0;
vector signed int vsumi1 = v0;
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = __lasx_xvldi(0);
__m256i sumi2 = __lasx_xvldi(0);
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * restrict qs = x[i].qs;
- const uint8_t * restrict qh = x[i].qh;
- const uint8_t * restrict signs = x[i].signs;
- const int8_t * restrict q8 = y[i].qs;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
}
#endif
-void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq1_s * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq1_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
vector signed int vsumi3 = vec_splats((int32_t)0);
vector signed int vsumi8 = vec_splats((int32_t)0);
- const uint8_t * restrict q1 = x[i].qs;
- const uint16_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
- const int16_t * restrict qs = y[i].bsums;
+ const uint8_t * GGML_RESTRICT q1 = x[i].qs;
+ const uint16_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ const int16_t * GGML_RESTRICT qs = y[i].bsums;
for (int j = 0; j < QK_K/32; j += 2) {
__builtin_prefetch(q1, 0, 1);
#endif
}
-void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(by);
UNUSED(bs);
- const block_iq1_m * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq1_m * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
#endif
}
-void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
assert(n % QK4_NL == 0);
static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
- const block_iq4_nl * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_iq4_nl * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
const int nb = n / QK4_NL;
const uint8x16_t v_m = vec_splat_u8(0x0F);
for (; ib < nb; ++ib) {
- const block_iq4_nl * restrict x0 = &x[ib];
- const block_q8_0 * restrict y0 = &y[ib];
+ const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
const uint8x16_t v_x = vec_xl(0, x0->qs);
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
*s = sumf;
}
-void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(bs);
assert(n % QK_K == 0);
- const block_iq4_xs * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_iq4_xs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
uint16_t h = x[ibl].scales_h;
- const uint8_t * restrict q4 = x[ibl].qs;
- const uint8_t * restrict sc = x[ibl].scales_l;
- const int8_t * restrict q8 = y[ibl].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
+ const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
for (int ib = 0; ib < QK_K/64; ib ++ ) {
__builtin_prefetch(q4, 0, 1);
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
- const uint8_t * restrict q4 = x[ibl].qs;
- const int8_t * restrict q8 = y[ibl].qs;
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
uint16_t h = x[ibl].scales_h;
// ============================ 4-bit non-linear quants
-void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
assert(k % QK4_NL == 0);
quantize_row_iq4_nl_ref(x, y, k);
}
-void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq4_xs(x, y, 1, k, NULL);
}
#define UNUSED GGML_UNUSED
// reference implementation for deterministic creation of model files
-void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
static const int qk = QK4_0;
assert(k % qk == 0);
}
}
-void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
+void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
const int qk = QK4_1;
assert(k % qk == 0);
}
}
-void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
+void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
static const int qk = QK5_0;
assert(k % qk == 0);
}
}
-void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
+void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
const int qk = QK5_1;
assert(k % qk == 0);
}
// reference implementation for deterministic creation of model files
-void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
+void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;
}
// reference implementation for deterministic creation of model files
-void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
+void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
assert(QK8_1 == 32);
assert(k % QK8_1 == 0);
const int nb = k / QK8_1;
}
}
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
static const int qk = QK4_0;
assert(k % qk == 0);
}
}
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
static const int qk = QK4_1;
assert(k % qk == 0);
}
}
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
static const int qk = QK5_0;
assert(k % qk == 0);
}
}
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
static const int qk = QK5_1;
assert(k % qk == 0);
}
}
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
static const int qk = QK8_0;
assert(k % qk == 0);
return (i & 0x007fffff) - 0x00400000;
}
-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
- const float * restrict qw) {
+static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+ const float * GGML_RESTRICT qw) {
float max = 0;
float amax = 0;
for (int i = 0; i < n; ++i) {
return scale;
}
-static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
float max = 0;
float amax = 0;
for (int i = 0; i < n; ++i) {
return 1/iscale;
}
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
int ntry, float alpha) {
float min = x[0];
float max = x[0];
return scale;
}
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
float rmin, float rdelta, int nstep, bool use_mad) {
float min = x[0];
float max = x[0];
return scale;
}
-static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
if (j < 4) {
*d = q[j] & 63; *m = q[j + 4] & 63;
} else {
//========================- 2-bit (de)-quantization
-void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
+void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
}
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
}
-static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
float rmin, float rdelta, int nstep, bool use_mad) {
float min = x[0];
float max = x[0];
return scale;
}
-static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
+static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
float max = 0;
for (int i = 0; i < n; ++i) {
max = MAX(max, x[i]);
return sumlx/suml2;
}
-static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
+static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
GGML_ASSERT(quant_weights);
assert(k % QK_K == 0);
const int nb = k / QK_K;
for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
float sigma2 = sumx2/QK_K;
for (int j = 0; j < QK_K/16; ++j) {
- const float * restrict qw = quant_weights + QK_K * i + 16*j;
+ const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
}
}
-size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
if (!quant_weights) {
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
//========================= 3-bit (de)-quantization
-void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
+void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
}
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
const float d_all = GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict q = x[i].qs;
- const uint8_t * restrict hm = x[i].hmask;
+ const uint8_t * GGML_RESTRICT q = x[i].qs;
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
uint8_t m = 1;
memcpy(aux, x[i].scales, 12);
}
}
-static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
+static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
assert(n_per_row % QK_K == 0);
const int nb = n_per_row / QK_K;
}
}
-size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
if (!quant_weights) {
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
// ====================== 4-bit (de)-quantization
-void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
+void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
}
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
}
}
-static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
assert(n_per_row % QK_K == 0);
const int64_t nb = n_per_row / QK_K;
}
}
-size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
if (!quant_weights) {
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
// ====================== 5-bit (de)-quantization
-void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
+void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
- uint8_t * restrict qh = y[i].qh;
- uint8_t * restrict ql = y[i].qs;
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
+ uint8_t * GGML_RESTRICT ql = y[i].qs;
memset(qh, 0, QK_K/8);
uint8_t m1 = 1, m2 = 2;
}
}
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
assert(n_per_row % QK_K == 0);
const int64_t nb = n_per_row / QK_K;
}
}
- uint8_t * restrict qh = y[i].qh;
- uint8_t * restrict ql = y[i].qs;
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
+ uint8_t * GGML_RESTRICT ql = y[i].qs;
memset(qh, 0, QK_K/8);
uint8_t m1 = 1, m2 = 2;
}
}
-size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
if (!quant_weights) {
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
// ====================== 6-bit (de)-quantization
-void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
+void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
- uint8_t * restrict ql = y[i].ql;
- uint8_t * restrict qh = y[i].qh;
+ uint8_t * GGML_RESTRICT ql = y[i].ql;
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) {
const uint8_t q1 = L[j + l + 0] & 0xF;
}
}
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d);
- const uint8_t * restrict ql = x[i].ql;
- const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict sc = x[i].scales;
+ const uint8_t * GGML_RESTRICT ql = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT sc = x[i].scales;
for (int n = 0; n < QK_K; n += 128) {
for (int l = 0; l < 32; ++l) {
}
}
-static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
assert(n_per_row % QK_K == 0);
const int64_t nb = n_per_row / QK_K;
}
}
- uint8_t * restrict ql = y[i].ql;
- uint8_t * restrict qh = y[i].qh;
+ uint8_t * GGML_RESTRICT ql = y[i].ql;
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) {
const uint8_t q1 = L[j + l + 0] & 0xF;
}
}
-size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
if (!quant_weights) {
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * row_size;
}
-static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
static_assert(QK4_0 == 32, "QK4_0 must be 32");
if (!quant_weights) {
}
}
-size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
return nrow * row_size;
}
-static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
static_assert(QK4_1 == 32, "QK4_1 must be 32");
if (!quant_weights) {
}
}
-size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
return nrow * row_size;
}
-static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
static_assert(QK5_0 == 32, "QK5_0 must be 32");
if (!quant_weights) {
}
}
-size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
return nrow * row_size;
}
-static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
+static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
static_assert(QK5_1 == 32, "QK5_1 must be 32");
if (!quant_weights) {
}
}
-size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
return nrow * row_size;
}
-size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
(void)quant_weights; // not used
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
-void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
+void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
+void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
(void)quant_weights; // not used
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * row_size;
}
-size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
(void)quant_weights; // not used
const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * row_size;
}
-void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
// ====================== "True" 2-bit (de)-quantization
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
// ====================== 2.3125 bpw (de)-quantization
-void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
// ====================== 2.5625 bpw (de)-quantization
-void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
// ====================== 3.0625 bpw (de)-quantization
-void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
// ====================== 3.3125 bpw (de)-quantization
-void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
// ====================== 1.5625 bpw (de)-quantization
-void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK4_NL == 0);
const int64_t nb = k / QK4_NL;
}
}
-void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
//===================================== Q8_K ==============================================
-void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
+void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
const int64_t nb = k / QK_K;
}
}
-static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
+static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
int num_neighbors = neighbours[0];
GGML_ASSERT(num_neighbors > 0);
float best_d2 = FLT_MAX;
return grid_index;
}
-static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
+static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
}
}
-static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
+static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
}
}
-size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
return nrow * nblock * sizeof(block_iq2_xxs);
}
-size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
}
}
-static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
+static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
int num_neighbors = neighbours[0];
GGML_ASSERT(num_neighbors > 0);
float best_d2 = FLT_MAX;
return grid_index;
}
-static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
- const float * restrict quant_weights) {
+static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
+ const float * GGML_RESTRICT quant_weights) {
const int gindex = iq3_data_index(grid_size);
}
}
-size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
return nrow * nblock * sizeof(block_iq3_xxs);
}
-void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
+void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
}
-static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
- const float * restrict quant_weights,
+static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
+ const float * GGML_RESTRICT quant_weights,
float * scales,
float * weight,
float * xval,
}
#define IQ3S_BLOCK_SIZE 32
-size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
float scales[QK_K/IQ3S_BLOCK_SIZE];
return nrow * nblock * sizeof(block_iq3_s);
}
-void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
+void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq3_s(x, y, 1, k, NULL);
}
// =================================== 1.5 bpw ===================================================
-static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
- const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
+static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
int num_neighbors = neighbours[0];
GGML_ASSERT(num_neighbors > 0);
float best_score = -FLT_MAX;
return grid_index;
}
-static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
- const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
+static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
int num_neighbors = neighbours[0];
GGML_ASSERT(num_neighbors > 0);
float best_score = FLT_MAX;
#define IQ1S_BLOCK_SIZE 32
#define IQ1M_BLOCK_SIZE 16
-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
+static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
float * scales,
float * weight,
float * sumx,
}
}
-size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
float scales[QK_K/IQ1S_BLOCK_SIZE];
float weight[IQ1S_BLOCK_SIZE];
return nrow * nblock * sizeof(block_iq1_s);
}
-static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
+static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
float * scales,
float * weight,
float * pairs,
}
}
-size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
float scales[QK_K/IQ1M_BLOCK_SIZE];
float weight[IQ1M_BLOCK_SIZE];
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
}
-static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
+static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
float * scales, float * weight, uint8_t * L,
const int8_t * values,
}
}
-size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK4_NL == 0);
int64_t nblock = n_per_row/QK4_NL;
char * qrow = (char *)dst;
return nrow * nblock * sizeof(block_iq4_nl);
}
-//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
-void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
+//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
GGML_ASSERT(k%QK4_NL == 0);
int64_t nblock = k/QK4_NL;
uint8_t L[QK4_NL];
}
}
-size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
return nrow * nblock * sizeof(block_iq4_xs);
}
-void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
+void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq4_xs(x, y, 1, k, NULL);
}
// =============================== 2.5625 bpw
-static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
+static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
}
}
-size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
return nrow * nblock * sizeof(block_iq2_s);
}
-void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
+void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
quantize_iq2_s(x, y, 1, k, NULL);
}
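// The change throughout this patch is mechanical: the C99 qualifier `restrict`
// is replaced by the GGML_RESTRICT macro so the same sources can also be built
// as C++ (where `restrict` is not a keyword) and with MSVC. A minimal sketch of
// such a macro is shown below; the exact definition provided by ggml's headers
// may differ, so treat this as an illustrative assumption rather than the
// shipped code.
#ifndef GGML_RESTRICT
#    ifdef __cplusplus
#        define GGML_RESTRICT __restrict   // GCC, Clang and MSVC all accept __restrict in C++
#    else
#        define GGML_RESTRICT restrict     // plain C: keep the standard C99 keyword
#    endif
#endif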