#define UNUSED GGML_UNUSED
+void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+ assert(QK8_0 == 32);
+ assert(k % QK8_0 == 0);
+ const int nb = k / QK8_0;
+
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+#if defined(__riscv_v_intrinsic)
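+ // vl_calc covers one 32-float row at e32/LMUL=8; vl_save covers the four
+ // 64-bit elements per register group used by the interleaving store below.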
+ const size_t vl_calc = __riscv_vsetvl_e32m8(QK8_0);
+ const size_t vl_save = __riscv_vsetvl_e64m2(4);
+ vfloat32m1_t v_scalar_zero = __riscv_vfmv_s_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1));
+
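+ // Each iteration quantizes one 32-element block from each of the four source
+ // rows (rows are k floats apart) and interleaves the results into y[i].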
+ for (int i = 0; i < nb; i++) {
+ const float *x_block_base = x + i * QK8_0;
+ vint8m2_t q_r0, q_r1, q_r2, q_r3;
+ {
+ vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 0 * k, vl_calc);
+ vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
+ vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
+
+ float d = amax / 127.0f;
+ y[i].d[0] = GGML_CPU_FP32_TO_FP16(d);
+
+ float id = d ? 1.0f / d : 0.0f;
+ vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
+ vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
+ q_r0 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
+ }
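+ // Compiler scheduling barrier between rows: at LMUL=8 one row's temporaries
+ // occupy most of the vector register file, so the four row computations are
+ // kept separate.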
+ asm volatile ("" ::: "memory");
+
+ {
+ vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 1 * k, vl_calc);
+ vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
+ vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
+
+ float d = amax / 127.0f;
+ y[i].d[1] = GGML_CPU_FP32_TO_FP16(d);
+ float id = d ? 1.0f / d : 0.0f;
+
+ vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
+ vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
+ q_r1 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
+ }
+ asm volatile ("" ::: "memory");
+ {
+ vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 2 * k, vl_calc);
+ vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
+ vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
+
+ float d = amax / 127.0f;
+ y[i].d[2] = GGML_CPU_FP32_TO_FP16(d);
+ float id = d ? 1.0f / d : 0.0f;
+
+ vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
+ vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
+ q_r2 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
+ }
+ asm volatile ("" ::: "memory");
+ {
+ vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 3 * k, vl_calc);
+ vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
+ vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
+
+ float d = amax / 127.0f;
+ y[i].d[3] = GGML_CPU_FP32_TO_FP16(d);
+ float id = d ? 1.0f / d : 0.0f;
+
+ vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
+ vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
+ q_r3 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
+ }
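+ // Reinterpret each quantized row as 4 x i64 and use a 4-field segmented
+ // store, interleaving the rows in 8-byte chunks to form the q8_0x4 layout.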
+ vint64m2_t v_q64_r0 = __riscv_vreinterpret_v_i8m2_i64m2(q_r0);
+ vint64m2_t v_q64_r1 = __riscv_vreinterpret_v_i8m2_i64m2(q_r1);
+ vint64m2_t v_q64_r2 = __riscv_vreinterpret_v_i8m2_i64m2(q_r2);
+ vint64m2_t v_q64_r3 = __riscv_vreinterpret_v_i8m2_i64m2(q_r3);
+ vint64m2x4_t v_quant_tuple = __riscv_vcreate_v_i64m2x4(v_q64_r0, v_q64_r1, v_q64_r2, v_q64_r3);
+ __riscv_vsseg4e64_v_i64m2x4((int64_t*)y[i].qs, v_quant_tuple, vl_save);
+ }
+#else
+ UNUSED(nb);
+ UNUSED(y);
+ ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
+#endif
+}
+
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
+void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
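+ // gemv: a single q8_0 activation row against tiles of 16 interleaved q4_0 columns.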
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
+
+ // 1x16 Accumulator
+ vfloat32m2_t sumf = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ // 1x16 Integer Accumulator
+ vint16m1_t sumi_0_lo_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_0_hi_16 = __riscv_vmv_v_x_i16m1(0, 16);
+
+ // Accumulation loop.
+ for (int i = 0; i < QK4_0 / 2; i++) {
+ // Load `b_ptr`.
+ const vint8mf2_t b_0_packed = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
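+ // sll/sra by 4 sign-extends the low nibble; sra alone gives the signed high nibble.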
+ const vint8mf2_t b_0_lo = __riscv_vsra_vx_i8mf2(__riscv_vsll_vx_i8mf2(b_0_packed, 4, 16), 4, 16);
+ const vint8mf2_t b_0_hi = __riscv_vsra_vx_i8mf2(b_0_packed, 4, 16);
+
+ sumi_0_lo_16 = __riscv_vwmacc_vx_i16m1(sumi_0_lo_16, a_ptr[l].qs[i], b_0_lo, 16);
+ sumi_0_hi_16 = __riscv_vwmacc_vx_i16m1(sumi_0_hi_16, a_ptr[l].qs[16 + i], b_0_hi, 16);
+ }
+
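+ // Combine the two i16 halves with a widening add so the final sum is i32.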
+ const vint32m2_t sumi = __riscv_vwadd_vv_i32m2(sumi_0_lo_16, sumi_0_hi_16, 16);
+
+ const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16);
+ const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d, 16);
+
+ sumf = __riscv_vfmacc_vv_f32m2(sumf, __riscv_vfcvt_f_x_v_f32m2(sumi, 16), d_0, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
+ }
+ return;
+#endif
+ ggml_gemv_q4_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK_K;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
+
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
+
+ // 1x16 Accumulator
+ vfloat32m2_t sumf = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ vint32m2_t sumi = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // Load `dmin`.
+ const vfloat32m2_t dmins_d = __riscv_vfmul_vf_f32m2(
+ __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].dmin, 16), 16), a_ptr[l].d, 16);
+
+ // We process 4 sub-blocks at once.
+ for (int j = 0; j < QK_K / 128; j++) {
+ // Extract the scales and the mins.
+ //
+ // Low bits.
+ vuint8m2_t scales_mins_lo = __riscv_vle8_v_u8m2(&b_ptr[l].scales[j * 64], 64);
+ vuint8m2_t scales_lo = __riscv_vand_vx_u8m2(scales_mins_lo, 0x0F, 64);
+ vuint8m2_t mins_lo = __riscv_vsrl_vx_u8m2(scales_mins_lo, 4, 64);
+
+ // High bits.
+ vuint8m2_t scales_mins_hi = __riscv_vle8_v_u8m2(&b_ptr[l].scales[128], 64);
+ vuint8m2_t scales_hi;
+ vuint8m2_t mins_hi;
+ if (!j) {
+ scales_hi = __riscv_vsll_vx_u8m2(__riscv_vand_vx_u8m2(scales_mins_hi, 0x03, 64), 4, 64);
+ mins_hi = __riscv_vsll_vx_u8m2(__riscv_vand_vx_u8m2(scales_mins_hi, 0x0C, 64), 2, 64);
+ } else {
+ scales_hi = __riscv_vand_vx_u8m2(scales_mins_hi, 0x30, 64);
+ mins_hi = __riscv_vsrl_vx_u8m2(__riscv_vand_vx_u8m2(scales_mins_hi, 0xC0, 64), 2, 64);
+ }
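+ // Assemble the full 6-bit scales/mins and widen to 16 bits for the widening maccs below.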
+ vuint16m4_t scales = __riscv_vzext_vf2_u16m4(__riscv_vor_vv_u8m2(scales_hi, scales_lo, 64), 64);
+ vint16m4_t mins = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vzext_vf2_u16m4(__riscv_vor_vv_u8m2(mins_hi, mins_lo, 64), 64));
+
+ // Reduce the mins, scale by `dmin * d`, and subtract the correction from
+ // `sumf`; each 32-element sub-block spans two adjacent 16-element bsums.
+ vint32m2_t bsums = __riscv_vmv_v_x_i32m2(0, 16);
+ bsums = __riscv_vwmacc_vx_i32m2(bsums, a_ptr[l].bsums[j * 8] + a_ptr[l].bsums[j * 8 + 1], __riscv_vget_v_i16m4_i16m1(mins, 0), 16);
+ bsums = __riscv_vwmacc_vx_i32m2(bsums, a_ptr[l].bsums[j * 8 + 2] + a_ptr[l].bsums[j * 8 + 3], __riscv_vget_v_i16m4_i16m1(mins, 1), 16);
+ bsums = __riscv_vwmacc_vx_i32m2(bsums, a_ptr[l].bsums[j * 8 + 4] + a_ptr[l].bsums[j * 8 + 5], __riscv_vget_v_i16m4_i16m1(mins, 2), 16);
+ bsums = __riscv_vwmacc_vx_i32m2(bsums, a_ptr[l].bsums[j * 8 + 6] + a_ptr[l].bsums[j * 8 + 7], __riscv_vget_v_i16m4_i16m1(mins, 3), 16);
+
+ sumf = __riscv_vfsub_vv_f32m2(sumf, __riscv_vfmul_vv_f32m2(dmins_d, __riscv_vfcvt_f_x_v_f32m2(bsums, 16), 16), 16);
+
+ // Accumulation for 2 sub-blocks (scales 0 and 1).
+ //
+ // Each i16 lane sums 16 products with |q8 * q4| <= 128 * 15, which stays
+ // below INT16_MAX, so the widening to i32 happens once per 16-element half.
+ for (int k = 0; k < 2; k++) {
+ vint16m1_t sumi_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+
+ for (int i = k * 16; i < k * 16 + QK4_0 / 2; i++) {
+ // Load `b_ptr`.
+ const vuint8mf2_t b_0_packed = __riscv_vle8_v_u8mf2(&b_ptr[l].qs[j * 1024 + i * 16], 16);
+ const vint8mf2_t b_s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(b_0_packed, 0xF, 16));
+ const vint8mf2_t b_s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(b_0_packed, 4, 16));
+
+ sumi_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_s_0_16, a_ptr[l].qs[j * 128 + i], b_s_0, 16);
+ sumi_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_s_1_16, a_ptr[l].qs[j * 128 + 32 + i], b_s_1, 16);
+ }
+
+ sumi = __riscv_vwmacc_vv_i32m2(sumi,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 0)),
+ sumi_s_0_16, 16);
+ sumi = __riscv_vwmacc_vv_i32m2(sumi,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 1)),
+ sumi_s_1_16, 16);
+ }
+ // Accumulation for the next 2 sub-blocks (scales 2 and 3), using the same
+ // two-step scheme to keep the i16 partial sums from overflowing.
+ for (int k = 0; k < 2; k++) {
+ vint16m1_t sumi_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+
+ for (int i = k * 16; i < k * 16 + QK4_0 / 2; i++) {
+ // Load `b_ptr`.
+ const vuint8mf2_t b_0_packed = __riscv_vle8_v_u8mf2(&b_ptr[l].qs[j * 1024 + 512 + i * 16], 16);
+ const vint8mf2_t b_s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(b_0_packed, 0xF, 16));
+ const vint8mf2_t b_s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(b_0_packed, 4, 16));
+
+ sumi_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_s_0_16, a_ptr[l].qs[j * 128 + 64 + i], b_s_0, 16);
+ sumi_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_s_1_16, a_ptr[l].qs[j * 128 + 96 + i], b_s_1, 16);
+ }
+
+ sumi = __riscv_vwmacc_vv_i32m2(sumi,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 2)),
+ sumi_s_0_16, 16);
+ sumi = __riscv_vwmacc_vv_i32m2(sumi,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 3)),
+ sumi_s_1_16, 16);
+ }
+ }
+
+ const vfloat32m2_t b_d = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1((const _Float16 *)&b_ptr[l].d[0], 16), 16);
+ const vfloat32m2_t d_0 = __riscv_vfmul_vf_f32m2(b_d, a_ptr[l].d, 16);
+
+ sumf = __riscv_vfmacc_vv_f32m2(sumf, __riscv_vfcvt_f_x_v_f32m2(sumi, 16), d_0, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
+ }
+ return;
+#endif
+ ggml_gemv_q4_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16);
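+ // 16-entry signed iq4_nl codebook; nibbles are decoded via vrgather lookups.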
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
+
+ // 1x16 Accumulator
+ vfloat32m2_t sumf = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ // 1x16 integer accumulator
+ vint32m2_t sumi = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // Accumulation loop.
+ for (int i = 0; i < QK4_NL / 2; i++) {
+ // Load `b_ptr`.
+ const vuint8mf2_t b_0_packed = __riscv_vle8_v_u8mf2((const uint8_t *)&b_ptr[l].qs[i * 16], 16);
+ const vint8mf2_t b_0_lo = __riscv_vrgather_vv_i8mf2(values, __riscv_vand_vx_u8mf2(b_0_packed, 0xf, 16), 16);
+ const vint8mf2_t b_0_hi = __riscv_vrgather_vv_i8mf2(values, __riscv_vsrl_vx_u8mf2(b_0_packed, 4, 16), 16);
+
+ const vint16m1_t sumi_lo = __riscv_vwmul_vx_i16m1(b_0_lo, a_ptr[l].qs[i], 16);
+ const vint16m1_t sumi_hi = __riscv_vwmul_vx_i16m1(b_0_hi, a_ptr[l].qs[16 + i], 16);
+ sumi = __riscv_vadd_vv_i32m2(sumi, __riscv_vwadd_vv_i32m2(sumi_lo, sumi_hi, 16), 16);
+ }
+
+ const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16);
+ const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d, 16);
+
+ sumf = __riscv_vfmacc_vv_f32m2(sumf, __riscv_vfcvt_f_x_v_f32m2(sumi, 16), d_0, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
+ }
+ return;
+#endif
+ ggml_gemv_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
+
+ // 1x16 Accumulator
+ vfloat32m2_t sumf = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ // 1x16 Integer Accumulator
+ vint32m2_t sumi = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // Accumulation loop.
+ for (int i = 0; i < QK8_0; i++) {
+ // Load `b_ptr`.
+ const vint8mf2_t b_0 = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
+
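+ // Widening multiply (i8 x i8 scalar -> i16), then widening accumulate into i32.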
+ sumi = __riscv_vwadd_wv_i32m2(sumi, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i], 16), 16);
+ }
+
+ const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16);
+ const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d, 16);
+
+ sumf = __riscv_vfmacc_vv_f32m2(sumf, __riscv_vfcvt_f_x_v_f32m2(sumi, 16), d_0, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
+ }
+ return;
+#endif
+ ggml_gemv_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ assert(n % QK_K == 0);
+ assert(nr == 1);
+ assert(nc % 16 == 0);
+
+ UNUSED(bs);
+
+#if defined __riscv_v_intrinsic
+ const int N_COLS_TILE = 16;
+ const int num_k_blocks = n / QK_K;
+
+ const size_t vl = __riscv_vsetvl_e32m2(N_COLS_TILE);
+ for (int col_tile = 0; col_tile < nc; col_tile += N_COLS_TILE) {
+
+ const block_q8_K* lhs_base_ptr = (const block_q8_K*)vy;
+ const block_q2_Kx16* rhs_base_ptr = (const block_q2_Kx16*)vx + (col_tile / N_COLS_TILE) * num_k_blocks;
+
+ vfloat32m2_t v_sumf = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+
+ for (int k_block = 0; k_block < num_k_blocks; ++k_block) {
+ const block_q8_K* lhs_current = &lhs_base_ptr[k_block];
+ const block_q2_Kx16* rhs_current = &rhs_base_ptr[k_block];
+
+ // 1. Prepare Global Min Scales
+ vfloat16m1_t v_g_min_f16 = __riscv_vle16_v_f16m1((const _Float16*)rhs_current->dmin, vl);
+ vfloat32m2_t v_g_min_base = __riscv_vfwcvt_f_f_v_f32m2(v_g_min_f16, vl);
+
+ vfloat32m2_t v_g_min_final = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d, vl);
+
+ vint32m2_t v_isum = __riscv_vmv_v_x_i32m2(0, vl);
+
+ const uint8_t* rhs_qs_ptr = rhs_current->qs;
+ const uint8_t* rhs_sc_ptr = rhs_current->scales;
+ const int8_t* lhs_qs_ptr = lhs_current->qs;
+
+ // --- Phase Loop (4 phases x 64 elements) ---
+ for (int phase = 0; phase < 4; ++phase) {
+
+ // A. Load Scales/Mins
+ vuint16m1_t v_d_sb_0, v_d_sb_1, v_d_sb_2, v_d_sb_3;
+ vuint16m1_t v_m_sb_0, v_m_sb_1, v_m_sb_2, v_m_sb_3;
+
+ {
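+ // Each scales byte packs a 4-bit scale (low nibble) and 4-bit min (high
+ // nibble) for one column; the four 16-byte groups are this phase's sub-blocks.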
+ vuint8mf2_t v_raw;
+ // Sub-block 0
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 0, vl);
+ v_d_sb_0 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_0 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ // Sub-block 1
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 16, vl);
+ v_d_sb_1 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_1 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ // Sub-block 2
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 32, vl);
+ v_d_sb_2 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_2 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ // Sub-block 3
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 48, vl);
+ v_d_sb_3 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_3 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ rhs_sc_ptr += 64;
+ }
+
+ int base_k_phase = (phase < 2) ? (phase * 16) : (128 + (phase-2)*16);
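+ // Phases 0/1 cover k = 0..127 and phases 2/3 cover k = 128..255; within a
+ // phase the four 2-bit planes map to sub-blocks spaced 32 elements apart.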
+ int k_offsets[4] = {0, 32, 64, 96};
+
+ // B. Inner Dot Product Loop
+ for (int l = 0; l < 16; ++l) {
+ vuint8mf2_t v_rhs_data = __riscv_vle8_v_u8mf2(rhs_qs_ptr, vl);
+ rhs_qs_ptr += 16;
+
+ // Sub-block 0
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(v_rhs_data, 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_0), vl);
+
+ int8_t q8 = lhs_qs_ptr[base_k_phase + k_offsets[0] + l];
+ v_isum = __riscv_vwmacc_vx_i32m2(v_isum, (int16_t)q8, v_w, vl);
+ }
+ // Sub-block 1
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(v_rhs_data, 2, vl), 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_1), vl);
+
+ int8_t q8 = lhs_qs_ptr[base_k_phase + k_offsets[1] + l];
+ v_isum = __riscv_vwmacc_vx_i32m2(v_isum, (int16_t)q8, v_w, vl);
+ }
+ // Sub-block 2
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(v_rhs_data, 4, vl), 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_2), vl);
+
+ int8_t q8 = lhs_qs_ptr[base_k_phase + k_offsets[2] + l];
+ v_isum = __riscv_vwmacc_vx_i32m2(v_isum, (int16_t)q8, v_w, vl);
+ }
+ // Sub-block 3
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(v_rhs_data, 6, vl), 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_3), vl);
+
+ int8_t q8 = lhs_qs_ptr[base_k_phase + k_offsets[3] + l];
+ v_isum = __riscv_vwmacc_vx_i32m2(v_isum, (int16_t)q8, v_w, vl);
+ }
+ }
+
+ // correction
+ int sb_base_abs = base_k_phase / 16;
+
+ // Sub-block 0
+ {
+ int sb_idx = sb_base_abs + (k_offsets[0] / 16);
+ int16_t bsum = lhs_current->bsums[sb_idx];
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_0);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, bsum, vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min_final, vl);
+ v_sumf = __riscv_vfsub_vv_f32m2(v_sumf, vf_c, vl);
+ }
+ // Sub-block 1
+ {
+ int sb_idx = sb_base_abs + (k_offsets[1] / 16);
+ int16_t bsum = lhs_current->bsums[sb_idx];
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_1);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, bsum, vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min_final, vl);
+ v_sumf = __riscv_vfsub_vv_f32m2(v_sumf, vf_c, vl);
+ }
+ // Sub-block 2
+ {
+ int sb_idx = sb_base_abs + (k_offsets[2] / 16);
+ int16_t bsum = lhs_current->bsums[sb_idx];
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_2);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, bsum, vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min_final, vl);
+ v_sumf = __riscv_vfsub_vv_f32m2(v_sumf, vf_c, vl);
+ }
+ // Sub-block 3
+ {
+ int sb_idx = sb_base_abs + (k_offsets[3] / 16);
+ int16_t bsum = lhs_current->bsums[sb_idx];
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_3);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, bsum, vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min_final, vl);
+ v_sumf = __riscv_vfsub_vv_f32m2(v_sumf, vf_c, vl);
+ }
+
+ } // End Phase Loop
+
+ // Apply the global scales.
+ vfloat16m1_t v_g_all_f16 = __riscv_vle16_v_f16m1((const _Float16*)rhs_current->d, vl);
+ vfloat32m2_t v_g_all_base = __riscv_vfwcvt_f_f_v_f32m2(v_g_all_f16, vl);
+
+ vfloat32m2_t v_g_all_final = __riscv_vfmul_vf_f32m2(v_g_all_base, lhs_current->d, vl);
+ vfloat32m2_t v_sum = __riscv_vfcvt_f_x_v_f32m2(v_isum, vl);
+ v_sum = __riscv_vfmul_vv_f32m2(v_sum, v_g_all_final, vl);
+ v_sumf = __riscv_vfadd_vv_f32m2(v_sumf, v_sum, vl);
+
+ } // End K-Block
+ __riscv_vse32_v_f32m2(s + col_tile, v_sumf, vl);
+
+ }
+ return;
+#endif
+ // Fallback mirrors the other 16x1 kernels (assumes the matching generic exists).
+ ggml_gemv_q2_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
#endif
ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
+
+void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
+
+ // 4x16 Accumulators
+ vfloat32m2_t sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ // 4x16 integer accumulators
+ vint16m1_t sumi_0_lo_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_1_lo_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_2_lo_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_3_lo_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_0_hi_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_1_hi_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_2_hi_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_3_hi_16 = __riscv_vmv_v_x_i16m1(0, 16);
+
+ // Accumulation loop.
+ for (int i = 0; i < QK4_0 / 2; i++) {
+ // Load `b_ptr`.
+ const vint8mf2_t b_0_packed = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
+ const vint8mf2_t b_0_lo = __riscv_vsra_vx_i8mf2(__riscv_vsll_vx_i8mf2(b_0_packed, 4, 16), 4, 16);
+ const vint8mf2_t b_0_hi = __riscv_vsra_vx_i8mf2(b_0_packed, 4, 16);
+
+ sumi_0_lo_16 = __riscv_vwmacc_vx_i16m1(sumi_0_lo_16, a_ptr[l].qs[i * 4], b_0_lo, 16);
+ sumi_1_lo_16 = __riscv_vwmacc_vx_i16m1(sumi_1_lo_16, a_ptr[l].qs[i * 4 + 1], b_0_lo, 16);
+ sumi_2_lo_16 = __riscv_vwmacc_vx_i16m1(sumi_2_lo_16, a_ptr[l].qs[i * 4 + 2], b_0_lo, 16);
+ sumi_3_lo_16 = __riscv_vwmacc_vx_i16m1(sumi_3_lo_16, a_ptr[l].qs[i * 4 + 3], b_0_lo, 16);
+
+ sumi_0_hi_16 = __riscv_vwmacc_vx_i16m1(sumi_0_hi_16, a_ptr[l].qs[64 + i * 4], b_0_hi, 16);
+ sumi_1_hi_16 = __riscv_vwmacc_vx_i16m1(sumi_1_hi_16, a_ptr[l].qs[64 + i * 4 + 1], b_0_hi, 16);
+ sumi_2_hi_16 = __riscv_vwmacc_vx_i16m1(sumi_2_hi_16, a_ptr[l].qs[64 + i * 4 + 2], b_0_hi, 16);
+ sumi_3_hi_16 = __riscv_vwmacc_vx_i16m1(sumi_3_hi_16, a_ptr[l].qs[64 + i * 4 + 3], b_0_hi, 16);
+ }
+
+ // Do the final accumulation in i32 to prevent overflow.
+ const vint32m2_t sumi_0 = __riscv_vwadd_vv_i32m2(sumi_0_lo_16, sumi_0_hi_16, 16);
+ const vint32m2_t sumi_1 = __riscv_vwadd_vv_i32m2(sumi_1_lo_16, sumi_1_hi_16, 16);
+ const vint32m2_t sumi_2 = __riscv_vwadd_vv_i32m2(sumi_2_lo_16, sumi_2_hi_16, 16);
+ const vint32m2_t sumi_3 = __riscv_vwadd_vv_i32m2(sumi_3_lo_16, sumi_3_hi_16, 16);
+
+ const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16);
+ const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[0], 16);
+ const vfloat32m2_t d_1 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[1], 16);
+ const vfloat32m2_t d_2 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[2], 16);
+ const vfloat32m2_t d_3 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[3], 16);
+
+ sumf_0 = __riscv_vfmacc_vv_f32m2(sumf_0, __riscv_vfcvt_f_x_v_f32m2(sumi_0, 16), d_0, 16);
+ sumf_1 = __riscv_vfmacc_vv_f32m2(sumf_1, __riscv_vfcvt_f_x_v_f32m2(sumi_1, 16), d_1, 16);
+ sumf_2 = __riscv_vfmacc_vv_f32m2(sumf_2, __riscv_vfcvt_f_x_v_f32m2(sumi_2, 16), d_2, 16);
+ sumf_3 = __riscv_vfmacc_vv_f32m2(sumf_3, __riscv_vfcvt_f_x_v_f32m2(sumi_3, 16), d_3, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + (y * 4 + 0) * bs + x * 16, sumf_0, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 1) * bs + x * 16, sumf_1, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 2) * bs + x * 16, sumf_2, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
+ }
+ }
+ return;
+#endif
+ ggml_gemm_q4_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK_K;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
+
+ // 4x16 Accumulators
+ vfloat32m2_t sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ vint32m2_t sumi_0 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_1 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_2 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_3 = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // Load `dmin`.
+ const vfloat32m2_t dmins = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].dmin, 16), 16);
+
+ // We process 4 sub-blocks at once.
+ for (int j = 0; j < QK_K / 128; j++) {
+ // Extract the scales and the mins.
+ //
+ // Low bits.
+ vuint8m2_t scales_mins_lo = __riscv_vle8_v_u8m2(&b_ptr[l].scales[j * 64], 64);
+ vuint8m2_t scales_lo = __riscv_vand_vx_u8m2(scales_mins_lo, 0x0F, 64);
+ vuint8m2_t mins_lo = __riscv_vsrl_vx_u8m2(scales_mins_lo, 4, 64);
+
+ // High bits.
+ vuint8m2_t scales_mins_hi = __riscv_vle8_v_u8m2(&b_ptr[l].scales[128], 64);
+ vuint8m2_t scales_hi;
+ vuint8m2_t mins_hi;
+ if (!j) {
+ scales_hi = __riscv_vsll_vx_u8m2(__riscv_vand_vx_u8m2(scales_mins_hi, 0x03, 64), 4, 64);
+ mins_hi = __riscv_vsll_vx_u8m2(__riscv_vand_vx_u8m2(scales_mins_hi, 0x0C, 64), 2, 64);
+ } else {
+ scales_hi = __riscv_vand_vx_u8m2(scales_mins_hi, 0x30, 64);
+ mins_hi = __riscv_vsrl_vx_u8m2(__riscv_vand_vx_u8m2(scales_mins_hi, 0xC0, 64), 2, 64);
+ }
+ vuint16m4_t scales = __riscv_vzext_vf2_u16m4(__riscv_vor_vv_u8m2(scales_hi, scales_lo, 64), 64);
+ vint16m4_t mins = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vzext_vf2_u16m4(__riscv_vor_vv_u8m2(mins_hi, mins_lo, 64), 64));
+
+ // Reduce the mins, scale by `dmin * d`, and subtract the correction from
+ // each row's `sumf`.
+ vint32m2_t bsums_0 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t bsums_1 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t bsums_2 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t bsums_3 = __riscv_vmv_v_x_i32m2(0, 16);
+
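+ // block_q8_Kx4 interleaves bsums by row (stride 4): pair the two 16-element
+ // halves of each 32-element sub-block, once per row.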
+ bsums_0 = __riscv_vwmacc_vx_i32m2(bsums_0,
+ a_ptr[l].bsums[j * 32] + a_ptr[l].bsums[j * 32 + 4],
+ __riscv_vget_v_i16m4_i16m1(mins, 0), 16);
+ bsums_1 = __riscv_vwmacc_vx_i32m2(bsums_1,
+ a_ptr[l].bsums[j * 32 + 1] + a_ptr[l].bsums[j * 32 + 5],
+ __riscv_vget_v_i16m4_i16m1(mins, 0), 16);
+ bsums_2 = __riscv_vwmacc_vx_i32m2(bsums_2,
+ a_ptr[l].bsums[j * 32 + 2] + a_ptr[l].bsums[j * 32 + 6],
+ __riscv_vget_v_i16m4_i16m1(mins, 0), 16);
+ bsums_3 = __riscv_vwmacc_vx_i32m2(bsums_3,
+ a_ptr[l].bsums[j * 32 + 3] + a_ptr[l].bsums[j * 32 + 7],
+ __riscv_vget_v_i16m4_i16m1(mins, 0), 16);
+ bsums_0 = __riscv_vwmacc_vx_i32m2(bsums_0,
+ a_ptr[l].bsums[j * 32 + 8] + a_ptr[l].bsums[j * 32 + 8 + 4],
+ __riscv_vget_v_i16m4_i16m1(mins, 1), 16);
+ bsums_1 = __riscv_vwmacc_vx_i32m2(bsums_1,
+ a_ptr[l].bsums[j * 32 + 8 + 1] + a_ptr[l].bsums[j * 32 + 8 + 5],
+ __riscv_vget_v_i16m4_i16m1(mins, 1), 16);
+ bsums_2 = __riscv_vwmacc_vx_i32m2(bsums_2,
+ a_ptr[l].bsums[j * 32 + 8 + 2] + a_ptr[l].bsums[j * 32 + 8 + 6],
+ __riscv_vget_v_i16m4_i16m1(mins, 1), 16);
+ bsums_3 = __riscv_vwmacc_vx_i32m2(bsums_3,
+ a_ptr[l].bsums[j * 32 + 8 + 3] + a_ptr[l].bsums[j * 32 + 8 + 7],
+ __riscv_vget_v_i16m4_i16m1(mins, 1), 16);
+ bsums_0 = __riscv_vwmacc_vx_i32m2(bsums_0,
+ a_ptr[l].bsums[j * 32 + 16] + a_ptr[l].bsums[j * 32 + 16 + 4],
+ __riscv_vget_v_i16m4_i16m1(mins, 2), 16);
+ bsums_1 = __riscv_vwmacc_vx_i32m2(bsums_1,
+ a_ptr[l].bsums[j * 32 + 16 + 1] + a_ptr[l].bsums[j * 32 + 16 + 5],
+ __riscv_vget_v_i16m4_i16m1(mins, 2), 16);
+ bsums_2 = __riscv_vwmacc_vx_i32m2(bsums_2,
+ a_ptr[l].bsums[j * 32 + 16 + 2] + a_ptr[l].bsums[j * 32 + 16 + 6],
+ __riscv_vget_v_i16m4_i16m1(mins, 2), 16);
+ bsums_3 = __riscv_vwmacc_vx_i32m2(bsums_3,
+ a_ptr[l].bsums[j * 32 + 16 + 3] + a_ptr[l].bsums[j * 32 + 16 + 7],
+ __riscv_vget_v_i16m4_i16m1(mins, 2), 16);
+ bsums_0 = __riscv_vwmacc_vx_i32m2(bsums_0,
+ a_ptr[l].bsums[j * 32 + 24 + 0] + a_ptr[l].bsums[j * 32 + 24 + 4],
+ __riscv_vget_v_i16m4_i16m1(mins, 3), 16);
+ bsums_1 = __riscv_vwmacc_vx_i32m2(bsums_1,
+ a_ptr[l].bsums[j * 32 + 24 + 1] + a_ptr[l].bsums[j * 32 + 24 + 5],
+ __riscv_vget_v_i16m4_i16m1(mins, 3), 16);
+ bsums_2 = __riscv_vwmacc_vx_i32m2(bsums_2,
+ a_ptr[l].bsums[j * 32 + 24 + 2] + a_ptr[l].bsums[j * 32 + 24 + 6],
+ __riscv_vget_v_i16m4_i16m1(mins, 3), 16);
+ bsums_3 = __riscv_vwmacc_vx_i32m2(bsums_3,
+ a_ptr[l].bsums[j * 32 + 24 + 3] + a_ptr[l].bsums[j * 32 + 24 + 7],
+ __riscv_vget_v_i16m4_i16m1(mins, 3), 16);
+
+ const vfloat32m2_t dmins_d_0 = __riscv_vfmul_vf_f32m2(dmins, a_ptr[l].d[0], 16);
+ const vfloat32m2_t dmins_d_1 = __riscv_vfmul_vf_f32m2(dmins, a_ptr[l].d[1], 16);
+ const vfloat32m2_t dmins_d_2 = __riscv_vfmul_vf_f32m2(dmins, a_ptr[l].d[2], 16);
+ const vfloat32m2_t dmins_d_3 = __riscv_vfmul_vf_f32m2(dmins, a_ptr[l].d[3], 16);
+
+ sumf_0 = __riscv_vfsub_vv_f32m2(sumf_0, __riscv_vfmul_vv_f32m2(dmins_d_0, __riscv_vfcvt_f_x_v_f32m2(bsums_0, 16), 16), 16);
+ sumf_1 = __riscv_vfsub_vv_f32m2(sumf_1, __riscv_vfmul_vv_f32m2(dmins_d_1, __riscv_vfcvt_f_x_v_f32m2(bsums_1, 16), 16), 16);
+ sumf_2 = __riscv_vfsub_vv_f32m2(sumf_2, __riscv_vfmul_vv_f32m2(dmins_d_2, __riscv_vfcvt_f_x_v_f32m2(bsums_2, 16), 16), 16);
+ sumf_3 = __riscv_vfsub_vv_f32m2(sumf_3, __riscv_vfmul_vv_f32m2(dmins_d_3, __riscv_vfcvt_f_x_v_f32m2(bsums_3, 16), 16), 16);
+
+
+ // Accumulation for 2 sub-blocks (scales 0 and 1).
+ //
+ // Each i16 lane sums 16 products with |q8 * q4| <= 128 * 15, safely below
+ // INT16_MAX; widen to i32 once per half.
+ for (int k = 0; k < 2; k++) {
+ // 4x16 integer accumulators
+ vint16m1_t sumi_0_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_1_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_2_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_3_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_0_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_1_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_2_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_3_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+
+ for (int i = k * 16; i < k * 16 + QK4_0 / 2; i++) {
+ // Load `b_ptr`.
+ const vuint8mf2_t b_0_packed = __riscv_vle8_v_u8mf2(&b_ptr[l].qs[j * 1024 + i * 16], 16);
+ const vint8mf2_t b_s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(b_0_packed, 0xF, 16));
+ const vint8mf2_t b_s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(b_0_packed, 4, 16));
+
+ sumi_0_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_0_s_0_16, a_ptr[l].qs[j * 512 + i * 4], b_s_0, 16);
+ sumi_1_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_1_s_0_16, a_ptr[l].qs[j * 512 + i * 4 + 1], b_s_0, 16);
+ sumi_2_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_2_s_0_16, a_ptr[l].qs[j * 512 + i * 4 + 2], b_s_0, 16);
+ sumi_3_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_3_s_0_16, a_ptr[l].qs[j * 512 + i * 4 + 3], b_s_0, 16);
+
+ sumi_0_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_0_s_1_16, a_ptr[l].qs[j * 512 + 128 + i * 4], b_s_1, 16);
+ sumi_1_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_1_s_1_16, a_ptr[l].qs[j * 512 + 128 + i * 4 + 1], b_s_1, 16);
+ sumi_2_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_2_s_1_16, a_ptr[l].qs[j * 512 + 128 + i * 4 + 2], b_s_1, 16);
+ sumi_3_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_3_s_1_16, a_ptr[l].qs[j * 512 + 128 + i * 4 + 3], b_s_1, 16);
+ }
+
+ sumi_0 = __riscv_vwmacc_vv_i32m2(sumi_0,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 0)),
+ sumi_0_s_0_16, 16);
+ sumi_0 = __riscv_vwmacc_vv_i32m2(sumi_0,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 1)),
+ sumi_0_s_1_16, 16);
+ sumi_1 = __riscv_vwmacc_vv_i32m2(sumi_1,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 0)),
+ sumi_1_s_0_16, 16);
+ sumi_1 = __riscv_vwmacc_vv_i32m2(sumi_1,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 1)),
+ sumi_1_s_1_16, 16);
+ sumi_2 = __riscv_vwmacc_vv_i32m2(sumi_2,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 0)),
+ sumi_2_s_0_16, 16);
+ sumi_2 = __riscv_vwmacc_vv_i32m2(sumi_2,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 1)),
+ sumi_2_s_1_16, 16);
+ sumi_3 = __riscv_vwmacc_vv_i32m2(sumi_3,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 0)),
+ sumi_3_s_0_16, 16);
+ sumi_3 = __riscv_vwmacc_vv_i32m2(sumi_3,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 1)),
+ sumi_3_s_1_16, 16);
+ }
+ // Accumulation for the next 2 sub-blocks (scales 2 and 3), using the same
+ // two-step scheme.
+ for (int k = 0; k < 2; k++) {
+ // 4x16 integer accumulators
+ vint16m1_t sumi_0_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_1_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_2_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_3_s_0_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_0_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_1_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_2_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+ vint16m1_t sumi_3_s_1_16 = __riscv_vmv_v_x_i16m1(0, 16);
+
+ for (int i = k * 16; i < k * 16 + QK4_0 / 2; i++) {
+ // Load `b_ptr`.
+ const vuint8mf2_t b_0_packed = __riscv_vle8_v_u8mf2(&b_ptr[l].qs[j * 1024 + 512 + i * 16], 16);
+ const vint8mf2_t b_s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(b_0_packed, 0xF, 16));
+ const vint8mf2_t b_s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(b_0_packed, 4, 16));
+
+ sumi_0_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_0_s_0_16, a_ptr[l].qs[j * 512 + 256 + i * 4], b_s_0, 16);
+ sumi_1_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_1_s_0_16, a_ptr[l].qs[j * 512 + 256 + i * 4 + 1], b_s_0, 16);
+ sumi_2_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_2_s_0_16, a_ptr[l].qs[j * 512 + 256 + i * 4 + 2], b_s_0, 16);
+ sumi_3_s_0_16 = __riscv_vwmacc_vx_i16m1(sumi_3_s_0_16, a_ptr[l].qs[j * 512 + 256 + i * 4 + 3], b_s_0, 16);
+
+ sumi_0_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_0_s_1_16, a_ptr[l].qs[j * 512 + 384 + i * 4], b_s_1, 16);
+ sumi_1_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_1_s_1_16, a_ptr[l].qs[j * 512 + 384 + i * 4 + 1], b_s_1, 16);
+ sumi_2_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_2_s_1_16, a_ptr[l].qs[j * 512 + 384 + i * 4 + 2], b_s_1, 16);
+ sumi_3_s_1_16 = __riscv_vwmacc_vx_i16m1(sumi_3_s_1_16, a_ptr[l].qs[j * 512 + 384 + i * 4 + 3], b_s_1, 16);
+ }
+
+ sumi_0 = __riscv_vwmacc_vv_i32m2(sumi_0,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 2)),
+ sumi_0_s_0_16, 16);
+ sumi_0 = __riscv_vwmacc_vv_i32m2(sumi_0,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 3)),
+ sumi_0_s_1_16, 16);
+ sumi_1 = __riscv_vwmacc_vv_i32m2(sumi_1,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 2)),
+ sumi_1_s_0_16, 16);
+ sumi_1 = __riscv_vwmacc_vv_i32m2(sumi_1,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 3)),
+ sumi_1_s_1_16, 16);
+ sumi_2 = __riscv_vwmacc_vv_i32m2(sumi_2,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 2)),
+ sumi_2_s_0_16, 16);
+ sumi_2 = __riscv_vwmacc_vv_i32m2(sumi_2,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 3)),
+ sumi_2_s_1_16, 16);
+ sumi_3 = __riscv_vwmacc_vv_i32m2(sumi_3,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 2)),
+ sumi_3_s_0_16, 16);
+ sumi_3 = __riscv_vwmacc_vv_i32m2(sumi_3,
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vget_v_u16m4_u16m1(scales, 3)),
+ sumi_3_s_1_16, 16);
+ }
+ }
+
+ const vfloat32m2_t b_d = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16), 16);
+ const vfloat32m2_t d_0 = __riscv_vfmul_vf_f32m2(b_d, a_ptr[l].d[0], 16);
+ const vfloat32m2_t d_1 = __riscv_vfmul_vf_f32m2(b_d, a_ptr[l].d[1], 16);
+ const vfloat32m2_t d_2 = __riscv_vfmul_vf_f32m2(b_d, a_ptr[l].d[2], 16);
+ const vfloat32m2_t d_3 = __riscv_vfmul_vf_f32m2(b_d, a_ptr[l].d[3], 16);
+
+ sumf_0 = __riscv_vfmacc_vv_f32m2(sumf_0, __riscv_vfcvt_f_x_v_f32m2(sumi_0, 16), d_0, 16);
+ sumf_1 = __riscv_vfmacc_vv_f32m2(sumf_1, __riscv_vfcvt_f_x_v_f32m2(sumi_1, 16), d_1, 16);
+ sumf_2 = __riscv_vfmacc_vv_f32m2(sumf_2, __riscv_vfcvt_f_x_v_f32m2(sumi_2, 16), d_2, 16);
+ sumf_3 = __riscv_vfmacc_vv_f32m2(sumf_3, __riscv_vfcvt_f_x_v_f32m2(sumi_3, 16), d_3, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + (y * 4 + 0) * bs + x * 16, sumf_0, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 1) * bs + x * 16, sumf_1, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 2) * bs + x * 16, sumf_2, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
+ }
+ }
+ return;
+#endif
+ ggml_gemm_q4_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16);
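+ // iq4_nl codebook for vrgather decoding (see the gemv kernel above).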
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
+
+ // 4x16 Accumulators
+ vfloat32m2_t sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ // 4x16 integer accumulators
+ vint32m2_t sumi_0 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_1 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_2 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_3 = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // Accumulation loop.
+ for (int i = 0; i < QK4_NL / 2; i++) {
+ // Load `b_ptr`.
+ const vuint8mf2_t b_0_packed = __riscv_vle8_v_u8mf2((const uint8_t *)&b_ptr[l].qs[i * 16], 16);
+ const vint8mf2_t b_0_lo = __riscv_vrgather_vv_i8mf2(values, __riscv_vand_vx_u8mf2(b_0_packed, 0xf, 16), 16);
+ const vint8mf2_t b_0_hi = __riscv_vrgather_vv_i8mf2(values, __riscv_vsrl_vx_u8mf2(b_0_packed, 4, 16), 16);
+
+ const vint16m1_t sumi_0_lo = __riscv_vwmul_vx_i16m1(b_0_lo, a_ptr[l].qs[i * 4], 16);
+ const vint16m1_t sumi_1_lo = __riscv_vwmul_vx_i16m1(b_0_lo, a_ptr[l].qs[i * 4 + 1], 16);
+ const vint16m1_t sumi_2_lo = __riscv_vwmul_vx_i16m1(b_0_lo, a_ptr[l].qs[i * 4 + 2], 16);
+ const vint16m1_t sumi_3_lo = __riscv_vwmul_vx_i16m1(b_0_lo, a_ptr[l].qs[i * 4 + 3], 16);
+
+ const vint16m1_t sumi_0_hi = __riscv_vwmul_vx_i16m1(b_0_hi, a_ptr[l].qs[64 + i * 4], 16);
+ const vint16m1_t sumi_1_hi = __riscv_vwmul_vx_i16m1(b_0_hi, a_ptr[l].qs[64 + i * 4 + 1], 16);
+ const vint16m1_t sumi_2_hi = __riscv_vwmul_vx_i16m1(b_0_hi, a_ptr[l].qs[64 + i * 4 + 2], 16);
+ const vint16m1_t sumi_3_hi = __riscv_vwmul_vx_i16m1(b_0_hi, a_ptr[l].qs[64 + i * 4 + 3], 16);
+
+ sumi_0 = __riscv_vadd_vv_i32m2(sumi_0, __riscv_vwadd_vv_i32m2(sumi_0_lo, sumi_0_hi, 16), 16);
+ sumi_1 = __riscv_vadd_vv_i32m2(sumi_1, __riscv_vwadd_vv_i32m2(sumi_1_lo, sumi_1_hi, 16), 16);
+ sumi_2 = __riscv_vadd_vv_i32m2(sumi_2, __riscv_vwadd_vv_i32m2(sumi_2_lo, sumi_2_hi, 16), 16);
+ sumi_3 = __riscv_vadd_vv_i32m2(sumi_3, __riscv_vwadd_vv_i32m2(sumi_3_lo, sumi_3_hi, 16), 16);
+ }
+
+ const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16);
+ const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[0], 16);
+ const vfloat32m2_t d_1 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[1], 16);
+ const vfloat32m2_t d_2 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[2], 16);
+ const vfloat32m2_t d_3 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[3], 16);
+
+ sumf_0 = __riscv_vfmacc_vv_f32m2(sumf_0, __riscv_vfcvt_f_x_v_f32m2(sumi_0, 16), d_0, 16);
+ sumf_1 = __riscv_vfmacc_vv_f32m2(sumf_1, __riscv_vfcvt_f_x_v_f32m2(sumi_1, 16), d_1, 16);
+ sumf_2 = __riscv_vfmacc_vv_f32m2(sumf_2, __riscv_vfcvt_f_x_v_f32m2(sumi_2, 16), d_2, 16);
+ sumf_3 = __riscv_vfmacc_vv_f32m2(sumf_3, __riscv_vfcvt_f_x_v_f32m2(sumi_3, 16), d_3, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + (y * 4 + 0) * bs + x * 16, sumf_0, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 1) * bs + x * 16, sumf_1, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 2) * bs + x * 16, sumf_2, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
+ }
+ }
+ return;
+#endif
+ ggml_gemm_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+#if defined __riscv_v_intrinsic
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
+
+ // 4x16 Accumulators
+ vfloat32m2_t sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+ vfloat32m2_t sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
+
+ for (int l = 0; l < nb; l++) {
+ // 4x16 Integer Accumulators
+ vint32m2_t sumi_0 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_1 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_2 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t sumi_3 = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // Accumulation loop.
+ for (int i = 0; i < QK8_0; i++) {
+ // Load `b_ptr`.
+ const vint8mf2_t b_0 = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
+
+ sumi_0 = __riscv_vwadd_wv_i32m2(sumi_0, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 0], 16), 16);
+ sumi_1 = __riscv_vwadd_wv_i32m2(sumi_1, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 1], 16), 16);
+ sumi_2 = __riscv_vwadd_wv_i32m2(sumi_2, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 2], 16), 16);
+ sumi_3 = __riscv_vwadd_wv_i32m2(sumi_3, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 3], 16), 16);
+ }
+
+ const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((const _Float16 *)b_ptr[l].d, 16);
+ const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[0], 16);
+ const vfloat32m2_t d_1 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[1], 16);
+ const vfloat32m2_t d_2 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[2], 16);
+ const vfloat32m2_t d_3 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[3], 16);
+
+ sumf_0 = __riscv_vfmacc_vv_f32m2(sumf_0, __riscv_vfcvt_f_x_v_f32m2(sumi_0, 16), d_0, 16);
+ sumf_1 = __riscv_vfmacc_vv_f32m2(sumf_1, __riscv_vfcvt_f_x_v_f32m2(sumi_1, 16), d_1, 16);
+ sumf_2 = __riscv_vfmacc_vv_f32m2(sumf_2, __riscv_vfcvt_f_x_v_f32m2(sumi_2, 16), d_2, 16);
+ sumf_3 = __riscv_vfmacc_vv_f32m2(sumf_3, __riscv_vfcvt_f_x_v_f32m2(sumi_3, 16), d_3, 16);
+ }
+
+ __riscv_vse32_v_f32m2(s + (y * 4 + 0) * bs + x * 16, sumf_0, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 1) * bs + x * 16, sumf_1, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 2) * bs + x * 16, sumf_2, 16);
+ __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
+ }
+ }
+ return;
+#endif
+ ggml_gemm_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ assert(n % QK_K == 0);
+ const int num_k_blocks = n / QK_K;
+ const int N_ROWS_TILE = 4;
+ const int N_COLS_TILE = 16;
+ assert(nr % N_ROWS_TILE == 0);
+ assert(nc % N_COLS_TILE == 0);
+
+ const size_t vl = __riscv_vsetvl_e32m2(N_COLS_TILE);
+ // --- Tiling Loops ---
+#pragma GCC unroll 1
+ for (int row_tile = 0; row_tile < nr; row_tile += N_ROWS_TILE) {
+#pragma GCC unroll 1
+ for (int col_tile = 0; col_tile < nc; col_tile += N_COLS_TILE) {
+ // Base Pointers
+ const block_q8_Kx4* lhs_base_ptr = (const block_q8_Kx4*)vy + (row_tile / N_ROWS_TILE) * num_k_blocks;
+ const block_q2_Kx16* rhs_base_ptr = (const block_q2_Kx16*)vx + (col_tile / N_COLS_TILE) * num_k_blocks;
+
+ // Persistent Float Accumulators
+ vfloat32m2_t v_sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+ vfloat32m2_t v_sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+ vfloat32m2_t v_sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+ vfloat32m2_t v_sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+
+ // --- Super-Block Loop (K=0..255) ---
+#pragma GCC unroll 1
+ for (int k_block = 0; k_block < num_k_blocks; ++k_block) {
+ const block_q8_Kx4* lhs_current = &lhs_base_ptr[k_block];
+ const block_q2_Kx16* rhs_current = &rhs_base_ptr[k_block];
+
+ // 1. Load Global Min Scales (Keep as F16/LMUL=1 to save registers)
+ vfloat16m1_t v_g_min_f16 = __riscv_vle16_v_f16m1((const _Float16*)rhs_current->dmin, vl);
+ vfloat32m2_t v_g_min_base = __riscv_vfwcvt_f_f_v_f32m2(v_g_min_f16, vl);
+
+ // 2. Initialize Integer Accumulators
+ vint32m2_t v_isum_0 = __riscv_vmv_v_x_i32m2(0, vl);
+ vint32m2_t v_isum_1 = __riscv_vmv_v_x_i32m2(0, vl);
+ vint32m2_t v_isum_2 = __riscv_vmv_v_x_i32m2(0, vl);
+ vint32m2_t v_isum_3 = __riscv_vmv_v_x_i32m2(0, vl);
+
+ const uint8_t* rhs_qs_ptr = rhs_current->qs;
+ const uint8_t* rhs_sc_ptr = rhs_current->scales;
+ const int8_t* lhs_qs_ptr = lhs_current->qs;
+
+ // --- Phase Loop (4 phases x 64 elements) ---
+#pragma GCC unroll 1
+ for (int phase = 0; phase < 4; ++phase) {
+
+ // A. Load Scales/Mins for the 4 interleaved sub-blocks
+ vuint16m1_t v_d_sb_0, v_d_sb_1, v_d_sb_2, v_d_sb_3;
+ vuint16m1_t v_m_sb_0, v_m_sb_1, v_m_sb_2, v_m_sb_3;
+
+ // Unrolled Load Logic
+ {
+ vuint8mf2_t v_raw;
+ // Sub-block 0
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 0, vl);
+ v_d_sb_0 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_0 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ // Sub-block 1
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 16, vl);
+ v_d_sb_1 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_1 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ // Sub-block 2
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 32, vl);
+ v_d_sb_2 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_2 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ // Sub-block 3
+ v_raw = __riscv_vle8_v_u8mf2(rhs_sc_ptr + 48, vl);
+ v_d_sb_3 = __riscv_vzext_vf2_u16m1(__riscv_vand_vx_u8mf2(v_raw, 0xF, vl), vl);
+ v_m_sb_3 = __riscv_vzext_vf2_u16m1(__riscv_vsrl_vx_u8mf2(v_raw, 4, vl), vl);
+
+ rhs_sc_ptr += 64;
+ }
+
+ int base_k_phase = (phase < 2) ? (phase * 16) : (128 + (phase-2)*16);
+ int k_offsets[4] = {0, 32, 64, 96};
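+ // Same phase/offset mapping as in the gemv kernel above.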
+
+ // B. Inner Dot Product Loop
+#pragma GCC unroll 1
+ for (int l = 0; l < 16; ++l) {
+ vuint8mf2_t v_rhs_data = __riscv_vle8_v_u8mf2(rhs_qs_ptr, vl);
+ rhs_qs_ptr += 16;
+
+ // Unroll over 4 sub-blocks (0, 1, 2, 3 relative to phase)
+
+ // --- Sub-block 0 ---
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(v_rhs_data, 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_0), vl);
+
+ const int8_t* q8 = &lhs_qs_ptr[(base_k_phase + k_offsets[0] + l) * 4];
+ v_isum_0 = __riscv_vwmacc_vx_i32m2(v_isum_0, (int16_t)q8[0], v_w, vl);
+ v_isum_1 = __riscv_vwmacc_vx_i32m2(v_isum_1, (int16_t)q8[1], v_w, vl);
+ v_isum_2 = __riscv_vwmacc_vx_i32m2(v_isum_2, (int16_t)q8[2], v_w, vl);
+ v_isum_3 = __riscv_vwmacc_vx_i32m2(v_isum_3, (int16_t)q8[3], v_w, vl);
+ }
+ // --- Sub-block 1 ---
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(v_rhs_data, 2, vl), 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_1), vl);
+
+ const int8_t* q8 = &lhs_qs_ptr[(base_k_phase + k_offsets[1] + l) * 4];
+ v_isum_0 = __riscv_vwmacc_vx_i32m2(v_isum_0, (int16_t)q8[0], v_w, vl);
+ v_isum_1 = __riscv_vwmacc_vx_i32m2(v_isum_1, (int16_t)q8[1], v_w, vl);
+ v_isum_2 = __riscv_vwmacc_vx_i32m2(v_isum_2, (int16_t)q8[2], v_w, vl);
+ v_isum_3 = __riscv_vwmacc_vx_i32m2(v_isum_3, (int16_t)q8[3], v_w, vl);
+ }
+ // --- Sub-block 2 ---
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(v_rhs_data, 4, vl), 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_2), vl);
+
+ const int8_t* q8 = &lhs_qs_ptr[(base_k_phase + k_offsets[2] + l) * 4];
+ v_isum_0 = __riscv_vwmacc_vx_i32m2(v_isum_0, (int16_t)q8[0], v_w, vl);
+ v_isum_1 = __riscv_vwmacc_vx_i32m2(v_isum_1, (int16_t)q8[1], v_w, vl);
+ v_isum_2 = __riscv_vwmacc_vx_i32m2(v_isum_2, (int16_t)q8[2], v_w, vl);
+ v_isum_3 = __riscv_vwmacc_vx_i32m2(v_isum_3, (int16_t)q8[3], v_w, vl);
+ }
+ // --- Sub-block 3 ---
+ {
+ vuint8mf2_t v_q2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(v_rhs_data, 6, vl), 3, vl);
+ vint16m1_t v_w = __riscv_vmul_vv_i16m1(
+ __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(v_q2, vl)),
+ __riscv_vreinterpret_v_u16m1_i16m1(v_d_sb_3), vl);
+
+ const int8_t* q8 = &lhs_qs_ptr[(base_k_phase + k_offsets[3] + l) * 4];
+ v_isum_0 = __riscv_vwmacc_vx_i32m2(v_isum_0, (int16_t)q8[0], v_w, vl);
+ v_isum_1 = __riscv_vwmacc_vx_i32m2(v_isum_1, (int16_t)q8[1], v_w, vl);
+ v_isum_2 = __riscv_vwmacc_vx_i32m2(v_isum_2, (int16_t)q8[2], v_w, vl);
+ v_isum_3 = __riscv_vwmacc_vx_i32m2(v_isum_3, (int16_t)q8[3], v_w, vl);
+ }
+ }
+
+ // C. Correction: subtract min_sb * bsum (scaled by dmin * d) per row and sub-block.
+ int sb_base_abs = base_k_phase / 16;
+
+ // --- Correction Sub-block 0 ---
+ {
+ int sb_abs = sb_base_abs + (k_offsets[0] / 16);
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_0);
+
+ // Row 0
+ vfloat32m2_t v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[0], vl);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 0], vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_0 = __riscv_vfsub_vv_f32m2(v_sumf_0, vf_c, vl);
+
+ // Row 1
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[1], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 1], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_1 = __riscv_vfsub_vv_f32m2(v_sumf_1, vf_c, vl);
+
+ // Row 2
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[2], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 2], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_2 = __riscv_vfsub_vv_f32m2(v_sumf_2, vf_c, vl);
+
+ // Row 3
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[3], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 3], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_3 = __riscv_vfsub_vv_f32m2(v_sumf_3, vf_c, vl);
+ }
+
+ // --- Correction Sub-block 1 ---
+ {
+ int sb_abs = sb_base_abs + (k_offsets[1] / 16);
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_1);
+
+ vfloat32m2_t v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[0], vl);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 0], vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_0 = __riscv_vfsub_vv_f32m2(v_sumf_0, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[1], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 1], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_1 = __riscv_vfsub_vv_f32m2(v_sumf_1, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[2], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 2], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_2 = __riscv_vfsub_vv_f32m2(v_sumf_2, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[3], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 3], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_3 = __riscv_vfsub_vv_f32m2(v_sumf_3, vf_c, vl);
+ }
+
+ // --- Correction Sub-block 2 ---
+ {
+ int sb_abs = sb_base_abs + (k_offsets[2] / 16);
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_2);
+
+ vfloat32m2_t v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[0], vl);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 0], vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_0 = __riscv_vfsub_vv_f32m2(v_sumf_0, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[1], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 1], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_1 = __riscv_vfsub_vv_f32m2(v_sumf_1, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[2], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 2], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_2 = __riscv_vfsub_vv_f32m2(v_sumf_2, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[3], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 3], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_3 = __riscv_vfsub_vv_f32m2(v_sumf_3, vf_c, vl);
+ }
+
+ // --- Correction Sub-block 3 ---
+ {
+ int sb_abs = sb_base_abs + (k_offsets[3] / 16);
+ vint16m1_t v_min = __riscv_vreinterpret_v_u16m1_i16m1(v_m_sb_3);
+
+ vfloat32m2_t v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[0], vl);
+ vint32m2_t v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 0], vl);
+ vfloat32m2_t vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_0 = __riscv_vfsub_vv_f32m2(v_sumf_0, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[1], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 1], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_1 = __riscv_vfsub_vv_f32m2(v_sumf_1, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[2], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 2], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_2 = __riscv_vfsub_vv_f32m2(v_sumf_2, vf_c, vl);
+
+ v_g_min = __riscv_vfmul_vf_f32m2(v_g_min_base, lhs_current->d[3], vl);
+ v_c = __riscv_vwmul_vx_i32m2(v_min, lhs_current->bsums[sb_abs * 4 + 3], vl);
+ vf_c = __riscv_vfmul_vv_f32m2(__riscv_vfcvt_f_x_v_f32m2(v_c, vl), v_g_min, vl);
+ v_sumf_3 = __riscv_vfsub_vv_f32m2(v_sumf_3, vf_c, vl);
+ }
+
+ } // End Phase Loop
+
+ // --- Apply Main Scales ---
+ vfloat16m1_t v_g_all_f16 = __riscv_vle16_v_f16m1((const _Float16*)rhs_current->d, vl);
+ vfloat32m2_t v_g_all_base = __riscv_vfwcvt_f_f_v_f32m2(v_g_all_f16, vl);
+
+ {
+ vfloat32m2_t v_g_all = __riscv_vfmul_vf_f32m2(v_g_all_base, lhs_current->d[0], vl);
+ vfloat32m2_t v_sum = __riscv_vfcvt_f_x_v_f32m2(v_isum_0, vl);
+ v_sum = __riscv_vfmul_vv_f32m2(v_sum, v_g_all, vl);
+ v_sumf_0 = __riscv_vfadd_vv_f32m2(v_sumf_0, v_sum, vl);
+ }
+ // Row 1
+ {
+ vfloat32m2_t v_g_all = __riscv_vfmul_vf_f32m2(v_g_all_base, lhs_current->d[1], vl);
+ vfloat32m2_t v_sum = __riscv_vfcvt_f_x_v_f32m2(v_isum_1, vl);
+ v_sum = __riscv_vfmul_vv_f32m2(v_sum, v_g_all, vl);
+ v_sumf_1 = __riscv_vfadd_vv_f32m2(v_sumf_1, v_sum, vl);
+ }
+ // Row 2
+ {
+ vfloat32m2_t v_g_all = __riscv_vfmul_vf_f32m2(v_g_all_base, lhs_current->d[2], vl);
+ vfloat32m2_t v_sum = __riscv_vfcvt_f_x_v_f32m2(v_isum_2, vl);
+ v_sum = __riscv_vfmul_vv_f32m2(v_sum, v_g_all, vl);
+ v_sumf_2 = __riscv_vfadd_vv_f32m2(v_sumf_2, v_sum, vl);
+ }
+ // Row 3
+ {
+ vfloat32m2_t v_g_all = __riscv_vfmul_vf_f32m2(v_g_all_base, lhs_current->d[3], vl);
+ vfloat32m2_t v_sum = __riscv_vfcvt_f_x_v_f32m2(v_isum_3, vl);
+ v_sum = __riscv_vfmul_vv_f32m2(v_sum, v_g_all, vl);
+ v_sumf_3 = __riscv_vfadd_vv_f32m2(v_sumf_3, v_sum, vl);
+ }
+
+ } // End K-Block
+
+ __riscv_vse32_v_f32m2(s + (row_tile + 0) * bs + col_tile, v_sumf_0, vl);
+ __riscv_vse32_v_f32m2(s + (row_tile + 1) * bs + col_tile, v_sumf_1, vl);
+ __riscv_vse32_v_f32m2(s + (row_tile + 2) * bs + col_tile, v_sumf_2, vl);
+ __riscv_vse32_v_f32m2(s + (row_tile + 3) * bs + col_tile, v_sumf_3, vl);
+ }
+ }
+}
extern "C" {
+#if defined __riscv_zvfh
+void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+ assert(QK8_0 == 32);
+ assert(k % QK8_0 == 0);
+ const int nb = k / QK8_0;
+
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+ // scalar
+ const int blck_size_interleave = 1;
+ float srcv[4][QK8_0];
+ float id[4];
+
+ for (int i = 0; i < nb; i++) {
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
+ float amax = 0.0f; // absolute max
+
+ for (int j = 0; j < QK8_0; j++) {
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
+ }
+
+ const float d = amax / ((1 << 7) - 1);
+ id[row_iter] = d ? 1.0f / d : 0.0f;
+
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
+ }
+
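+ // With blck_size_interleave == 1 the mapping below reduces to
+ // src_id = j % 4 and src_offset = j / 4, i.e. the four rows are
+ // interleaved element-wise: qs[4*k + row] holds element k of row `row`.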
+ for (int j = 0; j < QK8_0 * 4; j++) {
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+ src_offset += (j % blck_size_interleave);
+
+ float x0 = srcv[src_id][src_offset] * id[src_id];
+ y[i].qs[j] = roundf(x0);
+ }
+ }
+}
+
+void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+ assert(QK_K == 256);
+ assert(k % QK_K == 0);
+ const int nb = k / QK_K;
+
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
+
+ const int blck_size_interleave = 1;
+ float srcv[4][QK_K];
+ float iscale[4];
+
+ for (int i = 0; i < nb; i++) {
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
+ float amax = 0.0f; // absolute max
+ float max = 0;
+
+ for (int j = 0; j < QK_K; j++) {
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
+ // Update the maximum value of the corresponding super block
+ if(amax < fabsf(srcv[row_iter][j])) {
+ amax = fabsf(srcv[row_iter][j]);
+ max = srcv[row_iter][j];
+ }
+ }
+
+ iscale[row_iter] = amax ? -127.f/max : 0;
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
+ }
+
+ for (int j = 0; j < QK_K / 4; j++) {
+ y[i].bsums[j] = 0;
+ }
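+ // j enumerates (k, row) pairs as j = 4*k + row; every 64 consecutive j
+ // cover one 16-element sub-block across the four rows, so each bsum
+ // accumulates into index = sub_block * 4 + row.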
+ for (int j = 0; j < QK_K * 4; j++) {
+ int src_id = j % 4;
+ int src_offset = j / 4;
+ int index = ((j >> 6) << 2) + (j & 3);
+
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
+ y[i].qs[j] = nearest_int(x0);
+ y[i].bsums[index] += y[i].qs[j];
+ }
+ }
+}
+#endif
+
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
}
}
-
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK_K == 256);
assert(k % QK_K == 0);
}
+#if defined __riscv_zvfh
+template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
+ assert(nrow == 4);
+ UNUSED(nrow);
+ ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row);
+}
+
+template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
+ assert(nrow == 4);
+ UNUSED(nrow);
+ ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row);
+}
+#endif
+
template <int M, int N>
static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
float * GGML_RESTRICT s,
}
}
+#if defined __riscv_zvfh
+void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ float sumf[16];
+ int sumi;
+
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
+
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
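+ // qs holds nibbles re-biased to two's complement by the 0x88 XOR in
+ // make_block_q4_0x16: `<< 4` sign-extends the low nibble and `& 0xF0`
+ // isolates the high one, both scaled by 16, hence the final `>> 4`.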
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+ }
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+ }
+}
+
+void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK_K;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+ float sumf[16];
+ float sum_minf[16];
+ uint8_t scales[128];
+ uint8_t mins[128];
+ int sumi1;
+ int sumi2;
+ int sumi;
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[j] = 0.0f;
+ sum_minf[j] = 0.0f;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int i = 0; i < 128; i++) {
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
+ mins[i] = b_ptr[l].scales[i] >> 4;
+ }
+ for (int i = 0; i < 64; i++) {
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
+ }
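+ // scales[] and mins[] now hold the full 6-bit values for all 8
+ // sub-blocks x 16 columns, laid out as [sb * 16 + col].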
+ for (int sb = 0; sb < 8; sb++) {
+ uint8_t *min = &mins[sb * 16];
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sum_minf[j] += min[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+ }
+ }
+ for (int sb = 0; sb < 8; sb += 2) {
+ uint8_t *scales_0 = &scales[sb * 16];
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
+ for (int i = 0; i < QK4_0; i++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi1 = 0;
+ sumi2 = 0;
+ sumi = 0;
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
+ sumi1 = (v0 * a_ptr[l].qs[sb * 32 + i]);
+ sumi2 = (v1 * a_ptr[l].qs[sb * 32 + 32 + i]);
+ sumi1 = sumi1 * scales_0[j];
+ sumi2 = sumi2 * scales_1[j];
+ sumi += sumi1 + sumi2;
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+ }
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+ }
+ }
+}
+
+void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert(nr == 1);
+ assert(n % qk == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ UNUSED(bs);
+ UNUSED(nr);
+
+ float sumf[16];
+ int sumi;
+
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
+
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+ }
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+ }
+}
+
+void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert(nr == 1);
+ assert(n % qk == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ UNUSED(bs);
+ UNUSED(nr);
+
+ float sumf[16];
+ int sumi;
+
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
+
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / blocklen); k++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+ }
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[x * ncols_interleaved + j] = sumf[j];
+ }
+ }
+}
+
+void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ assert(n % QK_K == 0);
+ assert(nr == 1);
+ assert(nc % 16 == 0);
+
+ UNUSED(bs);
+
+ const int nb = n / QK_K;
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
+ const block_q8_K * y = (const block_q8_K *)vy;
+
+ // Layout: Even-Low(0,2,4,6), Odd-Low(1,3,5,7), Even-High(8,10,12,14), Odd-High(9,11,13,15)
+ const int sb_perm[16] = {
+ 0, 4, 1, 5, 2, 6, 3, 7, // 0-7
+ 8, 12, 9, 13, 10, 14, 11, 15 // 8-15
+ };
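+ // sb_perm maps a sub-block index to its 16-byte chunk in the repacked
+ // scales (see make_block_q2_Kx16), e.g. sub-block 1 is the first entry
+ // of the Odd-Low group and lives at scales[4 * 16 .. 4 * 16 + 15].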
+
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
+ const block_q8_K * y_ptr = y;
+
+ float sumf[16] = {0};
+
+ // Loop over K-blocks
+ for (int k_block = 0; k_block < nb; ++k_block) {
+ int32_t isum[16] = {0};
+ int32_t summs[16] = {0};
+
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
+
+ // Iterate over sub-blocks 0..15
+ for (int sb = 0; sb < 16; ++sb) {
+ // Correction Term
+ int16_t bsum = bs_lhs[sb];
+ int scale_offset = sb_perm[sb] * 16;
+
+ for (int col = 0; col < 16; ++col) {
+ uint8_t sc_val = sc_rhs[scale_offset + col];
+ summs[col] += bsum * (sc_val >> 4); // Min is high 4 bits
+ }
+
+ // Main Dot Product
+ // Calculate base offsets for Q2 unpacking based on SB
+ int byte_base;
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
+
+ int shift = ((sb / 2) % 4) * 2;
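+ // Each repacked byte holds four 2-bit values of one column; e.g. for
+ // sb == 5 (low half, odd slot): byte_base == 16 and shift == 4.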
+
+ for (int col = 0; col < 16; ++col) {
+ uint8_t sc_val = sc_rhs[scale_offset + col];
+ int32_t d_sb = sc_val & 0xF; // Scale is low 4 bits
+
+ // Process 16 elements (l=0..15)
+ for (int l = 0; l < 16; ++l) {
+ // Q2: Interleaved by column. Byte `l` contains 4 k-values.
+ int qs_idx = (byte_base + l) * 16 + col;
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
+
+ // Q8: Linear access
+ int k = sb * 16 + l;
+ int8_t q8_val = qs_lhs[k];
+
+ isum[col] += q8_val * q2_val * d_sb;
+ }
+ }
+ }
+
+ // Finalize K-Block
+ for (int col = 0; col < 16; ++col) {
+ float d_lhs = y_ptr[k_block].d;
+ float d_rhs = GGML_CPU_FP16_TO_FP32(x_ptr[k_block].d[col]);
+ float dm_rhs = GGML_CPU_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
+
+ float d_all = d_lhs * d_rhs;
+ float d_min = d_lhs * dm_rhs;
+
+ sumf[col] += (isum[col] * d_all) - (summs[col] * d_min);
+ }
+ }
+
+ for (int col = 0; col < 16; ++col) {
+ s[col_tile + col] = sumf[col];
+ }
+ }
+}
+#endif
+
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
}
}
+
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
}
}
+#if defined __riscv_zvfh
+void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ float sumf[4][16];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+ }
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++)
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+}
+
+void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK_K;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ float sumf[4][16];
+ float sum_minf[4][16];
+ uint8_t scales[128];
+ uint8_t mins[128];
+ int sumi1;
+ int sumi2;
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[m][j] = 0.0;
+ sum_minf[m][j] = 0.0;
+ }
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int i = 0; i < 128; i++) {
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
+ mins[i] = b_ptr[l].scales[i] >> 4;
+ }
+ for (int i = 0; i < 64; i++) {
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
+ }
+
+ for (int sb = 0; sb < 8; sb++) {
+ uint8_t *min = &mins[sb * 16];
+ for(int m = 0; m < 4; m++) {
+ const int16_t bsums = a_ptr[l].bsums[sb * 8 + m] + a_ptr[l].bsums[sb * 8 + m + 4];
+ for(int j = 0; j < ncols_interleaved; j++) {
+ sum_minf[m][j] += min[j] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+ }
+ }
+ }
+
+ for (int sb = 0; sb < 8; sb += 2) {
+ uint8_t *scales_0 = &scales[sb * 16];
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
+
+ for (int i = 0; i < QK4_0; i++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi1 = 0;
+ sumi2 = 0;
+ sumi = 0;
+
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
+ sumi1 = (v0 * a_ptr[l].qs[sb * 4 * 32 + i * 4 + m]);
+ sumi2 = (v1 * a_ptr[l].qs[sb * 4 * 32 + 32 * 4 + i * 4 + m]);
+ sumi1 = sumi1 * scales_0[j];
+ sumi2 = sumi2 * scales_1[j];
+ sumi += sumi1 + sumi2;
+
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+ }
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+ }
+ }
+ }
+ }
+}
+
+void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert(n % qk == 0);
+ assert(nr % 4 == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ float sumf[4][16];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + (qk / 2) * 4]));
+ }
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++)
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+}
+
+void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 16;
+ const int blocklen = 1;
+
+ assert(n % qk == 0);
+ assert(nr % 4 == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ float sumf[4][16];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[m][j] = 0.0;
+ }
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / blocklen); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+ }
+ sumf[m][j] +=
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+ }
+}
+
+void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ assert(n % QK_K == 0);
+ assert(nr % 4 == 0);
+ assert(nc % 16 == 0);
+ const int nb = n / QK_K;
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
+ const block_q8_Kx4 * y = (const block_q8_Kx4 *)vy;
+
+ const int sb_perm[16] = {
+ 0, 4, 1, 5, 2, 6, 3, 7,
+ 8, 12, 9, 13, 10, 14, 11, 15
+ };
+
+ // Iterate Rows in tiles of 4
+ for (int row_tile = 0; row_tile < nr; row_tile += 4) {
+ // Iterate Columns in tiles of 16
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
+
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
+ const block_q8_Kx4 * y_ptr = y + (row_tile / 4) * nb;
+
+ float sumf[4][16];
+ memset(sumf, 0, sizeof(sumf));
+
+ for (int k_block = 0; k_block < nb; ++k_block) {
+ int32_t isum[4][16];
+ int32_t summs[4][16];
+ memset(isum, 0, sizeof(isum));
+ memset(summs, 0, sizeof(summs));
+
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
+
+ for (int sb = 0; sb < 16; ++sb) {
+ int scale_offset = sb_perm[sb] * 16;
+
+ int byte_base;
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
+ int shift = ((sb / 2) % 4) * 2;
+
+ for (int col = 0; col < 16; ++col) {
+ uint8_t sc_val = sc_rhs[scale_offset + col];
+ int32_t d_sb = sc_val & 0xF;
+ int32_t m_sb = sc_val >> 4;
+
+ // Correction Term
+ for (int r = 0; r < 4; ++r) {
+ // block_q8_Kx4 with interleave 1 stores bsums[sub * 4 + row]
+ // (see ggml_quantize_mat_q8_K_4x1_generic above)
+ int bsum_idx = sb * 4 + r;
+ summs[r][col] += bs_lhs[bsum_idx] * m_sb;
+ }
+
+ // Main Dot Product
+ for (int l = 0; l < 16; ++l) {
+ int qs_idx = (byte_base + l) * 16 + col;
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
+
+ // block_q8_Kx4 with interleave 1 stores qs[k * 4 + row]
+ int k = sb * 16 + l;
+ int q8_idx = k * 4;
+
+ for (int r = 0; r < 4; ++r) {
+ int8_t q8_val = qs_lhs[q8_idx + r];
+ isum[r][col] += q8_val * q2_val * d_sb;
+ }
+ }
+ }
+ }
+
+ // Finalize K-Block
+ for (int col = 0; col < 16; ++col) {
+ float d_rhs = GGML_CPU_FP16_TO_FP32(x_ptr[k_block].d[col]);
+ float dm_rhs = GGML_CPU_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
+
+ for (int r = 0; r < 4; ++r) {
+ float d_lhs = y_ptr[k_block].d[r];
+ float d_all = d_lhs * d_rhs;
+ float d_min = d_lhs * dm_rhs;
+ sumf[r][col] += (isum[r][col] * d_all) - (summs[r][col] * d_min);
+ }
+ }
+ }
+
+ for (int r = 0; r < 4; ++r) {
+ for (int col = 0; col < 16; ++col) {
+ s[(row_tile + r) * bs + (col_tile + col)] = sumf[r][col];
+ }
+ }
+ }
+ }
+}
+#endif
+
} // extern "C"
static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
return out;
}
+static block_q4_0x16 make_block_q4_0x16(block_q4_0 * in, unsigned int blck_size_interleave) {
+ block_q4_0x16 out;
+
+ for (int i = 0; i < 16; i++) {
+ out.d[i] = in[i].d;
+ }
+
+ const int end = QK4_0 * 8 / blck_size_interleave;
+
+ if (blck_size_interleave == 1) {
+ const uint8_t xor_mask = 0x88;
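+ // XOR with 0x8 per nibble (0x88 covers both) turns the offset-binary
+ // storage (q + 8 in 0..15) into two's complement -8..7, which the
+ // 16x1 kernels decode with plain sign-extending shifts.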
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 16;
+ int src_offset = i / 16;
+ int dst_offset = i;
+
+ out.qs[dst_offset] = in[src_id].qs[src_offset] ^ xor_mask;
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ return out;
+}
+
static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
block_q4_Kx8 out;
//Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
return out;
}
+static block_q4_Kx16 make_block_q4_Kx16(block_q4_K * in, unsigned int blck_size_interleave) {
+ block_q4_Kx16 out;
+ //Delta(scale) and dmin values of the 16 Q4_K structures are copied onto the output interleaved structure
+ for (int i = 0; i < 16; i++) {
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
+ }
+
+ for (int i = 0; i < 16; i++) {
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
+ }
+
+ const int end = QK_K * 8 / blck_size_interleave;
+
+ if (blck_size_interleave == 1) {
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 16;
+ int src_offset = i / 16;
+ int dst_offset = i;
+
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
+ }
+
+ // RVV repacking.
+ //
+ // Extract the 6-bit scales and mins of all 8 sub-blocks for each of the 16 Q4_K blocks.
+ uint8_t s[128], m[128];
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 16; j++) {
+ s[i * 16 + j] = in[j].scales[i] & 63;
+ m[i * 16 + j] = in[j].scales[i + 4] & 63;
+ }
+ }
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 16; j++) {
+ s[64 + i * 16 + j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
+ m[64 + i * 16 + j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
+ }
+ }
+
+ for (int i = 0; i < 128; i++) {
+ out.scales[i] = (s[i] & 15) | ((m[i] & 15) << 4);
+ }
+ for (int i = 0; i < 64; i++) {
+ out.scales[128 + i] = ((s[i] & 48) >> 4) | ((m[i] & 48) >> 2) | (s[64 + i] & 48) | ((m[64 + i] & 48) << 2);
+ }
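+ // out.scales[0..127]: low 4 bits of each scale (low nibble) and each
+ // min (high nibble); out.scales[128..191]: the two high bits of four
+ // values per byte. The 16x1 kernels invert this in their unpack loops.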
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ return out;
+}
+
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
block_q2_Kx8 out;
return out;
}
+static block_q2_Kx16 make_block_q2_Kx16(const block_q2_K * in, unsigned int blck_size_interleave) {
+ block_q2_Kx16 out;
+ constexpr int N_COLS = 16;
+
+ // 1. Copy Super-Scales (d) and Super-Mins (dmin)
+ for (int i = 0; i < N_COLS; i++) {
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
+ }
+
+ // 2. Interleave Q2_K Data
+ const int bytes_per_col = 64;
+ const int total_bytes = N_COLS * bytes_per_col;
+ const int end = total_bytes / blck_size_interleave;
+
+ for (int i = 0; i < end; ++i) {
+ int src_col_id = i % N_COLS;
+ int src_offset = (i / N_COLS) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+ memcpy(&out.qs[dst_offset], &in[src_col_id].qs[src_offset], blck_size_interleave);
+ }
+
+ // 3. Repack Scales into the Optimized "Sequential-Parallel" Layout
+ int out_idx = 0;
+
+ // Arrays define the sub-block order for each group
+ const int even_low_sbs[] = {0, 2, 4, 6};
+ const int odd_low_sbs[] = {1, 3, 5, 7};
+ const int even_high_sbs[] = {8, 10, 12, 14};
+ const int odd_high_sbs[] = {9, 11, 13, 15};
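+ // Resulting chunk order: {0,2,4,6, 1,3,5,7, 8,10,12,14, 9,11,13,15};
+ // sb_perm in the 16x1 q2_K kernels is the inverse of this permutation.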
+
+ // Pack Group 1: Even-Low
+ for (int sb : even_low_sbs) {
+ for (int col = 0; col < N_COLS; col++) {
+ out.scales[out_idx++] = in[col].scales[sb];
+ }
+ }
+
+ // Pack Group 2: Odd-Low
+ for (int sb : odd_low_sbs) {
+ for (int col = 0; col < N_COLS; col++) {
+ out.scales[out_idx++] = in[col].scales[sb];
+ }
+ }
+
+ // Pack Group 3: Even-High
+ for (int sb : even_high_sbs) {
+ for (int col = 0; col < N_COLS; col++) {
+ out.scales[out_idx++] = in[col].scales[sb];
+ }
+ }
+
+ // Pack Group 4: Odd-High
+ for (int sb : odd_high_sbs) {
+ for (int col = 0; col < N_COLS; col++) {
+ out.scales[out_idx++] = in[col].scales[sb];
+ }
+ }
+
+ return out;
+}
+
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
GGML_UNUSED(data_size);
}
+static int repack_q4_K_to_q4_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
+ constexpr int nrows_interleaved = 16;
+
+ block_q4_Kx16 * dst = (block_q4_Kx16*)t->data;
+ const block_q4_K * src = (const block_q4_K*) data;
+ block_q4_K dst_tmp[16];
+ int nrow = ggml_nrows(t);
+ int nblocks = t->ne[0] / QK_K;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
+
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++ ) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_q4_Kx16(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+
+ GGML_UNUSED(data_size);
+}
+
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
GGML_ASSERT(interleave_block == 8);
GGML_UNUSED(data_size);
}
+static int repack_q2_K_to_q2_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
+ constexpr int nrows_interleaved = 16;
+
+ block_q2_Kx16 * dst = (block_q2_Kx16*)t->data;
+ const block_q2_K * src = (const block_q2_K*) data;
+
+ block_q2_K dst_tmp[nrows_interleaved];
+
+ int nrow = ggml_nrows(t);
+ int nblocks = t->ne[0] / QK_K;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
+
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ // This loop gathers 16 separate blocks (one from each column)
+ // that correspond to the same K-dimension chunk.
+ for (int i = 0; i < nrows_interleaved; i++ ) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+
+ *dst++ = make_block_q2_Kx16(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+
+ GGML_UNUSED(data_size);
+}
+
+static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+ constexpr int nrows_interleaved = 16;
+
+ block_q4_0x16 * dst = (block_q4_0x16*)t->data;
+ const block_q4_0 * src = (const block_q4_0*) data;
+ block_q4_0 dst_tmp[16];
+ int nrow = ggml_nrows(t);
+ int nblocks = t->ne[0] / QK4_0;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++ ) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_q4_0x16(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+
+ GGML_UNUSED(data_size);
+}
+
static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
int interleave_block,
const void * GGML_RESTRICT data,
return 0;
}
+static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
+ block_q8_0x16 out;
+
+ for (int i = 0; i < 16; i++) {
+ out.d[i] = in[i].d;
+ }
+
+ const int end = QK8_0 * 16 / blck_size_interleave;
+
+ if (blck_size_interleave == 1) {
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 16;
+ int src_offset = i / 16;
+ int dst_offset = i;
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ return out;
+}
+
+static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
+ int interleave_block,
+ const void * GGML_RESTRICT data,
+ size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
+ constexpr int nrows_interleaved = 16;
+
+ block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
+ const block_q8_0 * src = (const block_q8_0 *) data;
+ block_q8_0 dst_tmp[16];
+ int nrow = ggml_nrows(t);
+ int nblocks = t->ne[0] / QK8_0;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
+
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+}
+
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
block_iq4_nlx4 out;
GGML_UNUSED(data_size);
}
+static block_iq4_nlx16 make_block_iq4_nlx16(block_iq4_nl * in, unsigned int blck_size_interleave) {
+ block_iq4_nlx16 out;
+
+ for (int i = 0; i < 16; i++) {
+ out.d[i] = in[i].d;
+ }
+
+ const int end = QK4_NL * 8 / blck_size_interleave;
+
+ if (blck_size_interleave == 1) {
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 16;
+ int src_offset = i / 16;
+ int dst_offset = i;
+
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+ GGML_ASSERT(interleave_block == 1);
+
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
+ block_iq4_nlx16 * dst = ( block_iq4_nlx16 *)t->data;
+
+ block_iq4_nl dst_tmp[16];
+
+ int nrow = ggml_nrows(t);
+ int nrows_interleaved = 16;
+ int nblocks = t->ne[0] / QK4_NL;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+ if (t->ne[1] % nrows_interleaved != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_iq4_nlx16(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+
+ GGML_UNUSED(data_size);
+}
static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
block_mxfp4x4 out;
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
}
+#if defined __riscv_zvfh
+template <> int repack<block_q4_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q4_0_to_q4_0_16_bl(t, 1, data, data_size);
+}
+
+template <> int repack<block_q4_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q4_K_to_q4_K_16_bl(t, 1, data, data_size);
+}
+
+template <> int repack<block_iq4_nl, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_iq4_nl_to_iq4_nl_16_bl(t, 1, data, data_size);
+}
+
+template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
+}
+
+template <> int repack<block_q2_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size);
+}
+#endif
+
// gemv
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemv(int, float *, size_t, const void *, const void *, int, int);
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
+#if defined __riscv_zvfh
+template <> void gemv<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+#endif
+
// gemm
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemm(int, float *, size_t, const void *, const void *, int, int);
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
+#if defined __riscv_zvfh
+template <> void gemm<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+#endif
+
class tensor_traits_base : public ggml::cpu::tensor_traits {
public:
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
+ // instances for RISC-V
+ //
+ // These implement outer-product style matrix multiplication kernels with
+ // an interleave of 1.
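+ //
+ // The 16-column tiles line up with the VLEN == 256 case in the dispatch
+ // below; other vector widths are left as TODO there.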
+#if defined __riscv_zvfh
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 1, 16, GGML_TYPE_Q8_0> q4_0_16x1_q8_0;
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 1, 16, GGML_TYPE_Q8_K> q4_K_16x1_q8_K;
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0> iq4_nl_16x1_q8_0;
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
+ static const ggml::cpu::repack::tensor_traits<block_q2_K, 1, 16, GGML_TYPE_Q8_K> q2_K_16x1_q8_K;
+#endif
+
if (cur->type == GGML_TYPE_Q4_0) {
- if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
- || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
if (cur->ne[1] % 8 == 0) {
return &q4_0_8x8_q8_0;
}
return &q4_0_4x4_q8_0;
}
}
+ if (ggml_cpu_has_riscv_v()) {
+ #if defined __riscv_zvfh
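+ // __riscv_vlenb() returns VLEN in bytes, so * 8 yields VLEN in bits.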
+ switch (__riscv_vlenb() * 8) {
+ case 128: { break; } // TODO
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_0_16x1_q8_0; } break; }
+ case 512: { break; } // TODO
+ case 1024: { break; } // TODO
+ default: { return nullptr; }
+ }
+ #endif
+ }
} else if (cur->type == GGML_TYPE_Q4_K) {
if (ggml_cpu_has_avx2()) {
if (cur->ne[1] % 8 == 0) {
return &q4_K_8x4_q8_K;
}
}
+ if (ggml_cpu_has_riscv_v()) {
+ #if defined __riscv_zvfh
+ switch (__riscv_vlenb() * 8) {
+ case 128: { break; } // TODO
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_K_16x1_q8_K; } break; }
+ case 512: { break; } // TODO
+ case 1024: { break; } // TODO
+ default: { return nullptr; }
+ }
+ #endif
+ }
} else if (cur->type == GGML_TYPE_Q2_K) {
if (ggml_cpu_has_avx512()) {
if (cur->ne[1] % 8 == 0) {
return &q2_K_8x8_q8_K;
}
}
+ if (ggml_cpu_has_riscv_v()) {
+ #if defined __riscv_zvfh
+ switch (__riscv_vlenb() * 8) {
+ case 128: { break; } // TODO
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q2_K_16x1_q8_K; } break; }
+ case 512: { break; } // TODO
+ case 1024: { break; } // TODO
+ default: { return nullptr; }
+ }
+ #endif
+ }
} else if (cur->type == GGML_TYPE_Q5_K) {
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 8 == 0) {
return &iq4_nl_4x4_q8_0;
}
}
+ if (ggml_cpu_has_riscv_v()) {
+ #if defined __riscv_zvfh
+ switch (__riscv_vlenb() * 8) {
+ case 128: { break; } // TODO
+ case 256: { if (cur->ne[1] % 16 == 0) { return &iq4_nl_16x1_q8_0; } break; }
+ case 512: { break; } // TODO
+ case 1024: { break; } // TODO
+ default: { return nullptr; }
+ }
+ #endif
+ }
} else if (cur->type == GGML_TYPE_MXFP4) {
if (ggml_cpu_has_avx2()) {
if (cur->ne[1] % 8 == 0) {
return &q8_0_4x4_q8_0;
}
}
+ if (ggml_cpu_has_riscv_v()) {
+ #if defined __riscv_zvfh
+ switch (__riscv_vlenb() * 8) {
+ case 128: { break; } // TODO
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
+ case 512: { break; } // TODO
+ case 1024: { break; } // TODO
+ default: { return nullptr; }
+ }
+ #endif
+ }
}
return nullptr;