_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
+#elif defined(__riscv_v_intrinsic)
+
+ size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+
+ for (int i = 0; i < nb; i++) {
+ // load elements
+ vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+
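+        // reduce to the maximum absolute value of the block (used for the scale d)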
+ vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+ // convert to integer
+ vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+ vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+ // store result
+ __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+ }
#else
// scalar
quantize_row_q8_0_reference(x, y, k);
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
+#elif defined(__riscv_v_intrinsic)
+
+ size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+
+ for (int i = 0; i < nb; i++) {
+ // load elements
+ vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+
+ vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
+ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ y[i].d = d;
+
+ vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+ // convert to integer
+ vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+ vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+ // store result
+ __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+
+ // compute sum for y[i].s
+ vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
+ vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+
+ // set y[i].s
+ int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
+ y[i].s = sum*d;
+ }
#else
// scalar
quantize_row_q8_1_reference(x, y, k);
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
- vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+ // load elements
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
- vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
- vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
- vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+    // mask the lower nibble of x, then shift to get the upper nibble
+ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
- vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
- vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+ vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+ vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
- vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
- vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+ // subtract offset
+ vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+ vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
- vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+ // load elements
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
- vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
- vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
- vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+    // mask the lower nibble of x, then shift to get the upper nibble
+ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
- vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
- vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+ vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+ vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
uint32_t qh;
- // These temp values are for masking and shift operations
- uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
- uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
- 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
-
size_t vl = __riscv_vsetvl_e8m1(qk/2);
+    // These temporary registers are for masking and shift operations
+ vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+ vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
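+    // vt_3 holds the (1u << (j + 16)) masks, vt_4 the (j + 12) shift amounts for the upper qh bits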
+ vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+ vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
- // temporary registers
- vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
- vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
- vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
- vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
-
// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
- vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
- vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
- vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+ vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
+ vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
+ vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
// ((qh & (1u << (j + 16))) >> (j + 12));
- vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
- vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+ vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
+ vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
// narrowing
- vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
- vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+ vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
+ vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
- vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
- vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+ vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
+ vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
// load
- vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
- vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
- vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
- vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+ vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+ vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
- vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
- vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+ vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+ vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
- vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
- vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+ vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+ vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
- vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
- vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+ vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
+ vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
}
uint32_t qh;
- // These temp values are for shift operations
- uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
size_t vl = __riscv_vsetvl_e8m1(qk/2);
+ // temporary registers for shift operations
+ vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+ vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
- // temporary registers
- vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
- vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
-
// load qh
- vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+ vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
// ((qh >> (j + 0)) << 4) & 0x10;
- vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
- vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
- vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+ vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
+ vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+ vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
// ((qh >> (j + 12)) ) & 0x10;
- vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
- vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+ vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
+ vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
// narrowing
- vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
- vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+ vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
+ vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
- vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
- vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+ vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
+ vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
// load
- vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
- vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
- vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
- vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+ vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+ vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
- vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
- vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+ vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+ vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
- vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
- vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+ vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+ vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
#endif
#endif
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
*s = hsum_float_8(acc);
+#elif defined __riscv_v_intrinsic
+
+ float sumf = 0;
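+    // gather index pattern: the low 16 lanes select one scale, the high 16 lanes the next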
+ uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * q2 = x[i].qs;
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * sc = x[i].scales;
+
+ const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+ size_t vl = 16;
+
+ vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
+ vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
+
+ vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
+
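+    // the upper scale nibbles are the mins; their dot product with the block sums gives the dmin correction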
+ vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
+ vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
+ vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+ vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
+ vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+
+ sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
+
+ vl = 32;
+
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+ vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
+
+ uint8_t is=0;
+ int isum=0;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ // load Q2
+ vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
+
+ vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
+ vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
+ vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
+ vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
+
+ // duplicate scale elements for product
+ vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
+ vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
+ vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
+ vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
+
+ vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
+ vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
+ vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
+ vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
+
+ // load Q8
+ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+ vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
+ vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
+
+ vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
+ vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
+ vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
+ vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
+
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
+
+ isum += __riscv_vmv_x_s_i32m1_i32(isum1);
+
+ q2+=32; q8+=128; is=8;
+
+ }
+
+ sumf += dall * isum;
+
+ }
+
+ *s = sumf;
+
#else
float sumf = 0;
*s = hsum_float_8(acc) + summs;
+#elif defined __riscv_v_intrinsic
+
+ uint32_t aux32[2];
+ const uint8_t * scales = (const uint8_t *)aux32;
+
+ float sumf = 0;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * (float)x[i].d;
+ const float dmin = -y[i].d * (float)x[i].dmin;
+
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+
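+    // low nibbles of the packed scales are the scales, high nibbles are the mins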
+ aux32[0] = sc[0] & 0x0f0f0f0f;
+ aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
+
+ sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
+
+ int isum1 = 0;
+ int isum2 = 0;
+
+ size_t vl = 16;
+
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+ // load Q2
+ vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
+
+ vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
+ vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
+ vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
+ vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
+
+ // load Q8, and take product with Q2
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+ vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
+ vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
+ vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
+ vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
+
+ isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
+ isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
+ isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
+ isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
+
+ sumf += d * (isum1 + isum2);
+
+ }
+
+ *s = sumf;
+
#else
float sumf = 0;
*s = hsum_float_8(acc);
+#elif defined __riscv_v_intrinsic
+
+ uint32_t aux[3];
+ uint32_t utmp[4];
+
+ float sumf = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q3 = x[i].qs;
+ const uint8_t * restrict qh = x[i].hmask;
+ const int8_t * restrict q8 = y[i].qs;
+
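+    // unpack the packed 6-bit scales from 12 bytes, then re-center them to signed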
+ memcpy(aux, x[i].scales, 12);
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+ int8_t * scale = (int8_t *)utmp;
+ for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+
+ size_t vl = 32;
+ uint8_t m = 1;
+
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+ vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
+
+ int sum_t = 0;
+
+ for (int j = 0; j < QK_K; j += 128) {
+
+ vl = 32;
+
+ // load Q3
+ vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
+
+ vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
+ vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
+ vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
+ vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
+
+ // compute mask for subtraction
+ vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
+ vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+ m <<= 1;
+
+ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+ vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+ m <<= 1;
+
+ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+ vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+ m <<= 1;
+
+ vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
+ vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+ m <<= 1;
+
+ // load Q8 and take product with Q3
+ vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
+ vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+ vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+ vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+ vl = 16;
+
+      // retrieve lane to multiply with scale
+ vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
+ vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
+ vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
+ vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
+ vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
+ vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
+ vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
+ vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
+
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
+ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
+ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
+
+ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+ q3 += 32; q8 += 128; scale += 8;
+
+ }
+
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+
+ sumf += d*sum_t;
+
+ }
+
+ *s = sumf;
+
#else
// scalar version
// This function is written like this so the compiler can manage to vectorize most of it
*s = hsum_float_8(acc);
+#elif defined __riscv_v_intrinsic
+
+ uint16_t aux16[2];
+ int8_t * scales = (int8_t *)aux16;
+
+ float sumf = 0;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q3 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint16_t a = *(const uint16_t *)x[i].scales;
+ aux16[0] = a & 0x0f0f;
+ aux16[1] = (a >> 4) & 0x0f0f;
+
+ for (int j = 0; j < 4; ++j) scales[j] -= 8;
+
+ int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
+
+ const float d = y[i].d * (float)x[i].d;
+
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ // load qh
+ vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
+ vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
+
+ size_t vl = 16;
+
+ // extend and combine both qh_x1 and qh_x2
+ vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
+
+ vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
+ vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
+ vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
+ vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
+
+ // load Q3
+ vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl);
+
+ vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
+ vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
+ vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
+ vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
+
+ vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
+ vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
+ vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
+ vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
+
+ // load Q8 and take product with Q3
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+ vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+ vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+ vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+ vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
+
+ sumf += d * isum;
+
+ }
+
+ *s = sumf;
+
#else
int8_t aux8[QK_K];
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
+#elif defined __riscv_v_intrinsic
+
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+ float sumf = 0;
+
+ for (int i = 0; i < nb; ++i) {
+
+ size_t vl = 8;
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+ const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
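+    // sum adjacent pairs of bsums using two strided loads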
+ vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+ vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+ vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+ memcpy(utmp, x[i].scales, 12);
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+ const uint32_t uaux = utmp[1] & kmask1;
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+ utmp[2] = uaux;
+ utmp[0] &= kmask1;
+
+ vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+ vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+ vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+ vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+ sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+ const uint8_t * restrict q4 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ vl = 32;
+
+ int32_t sum_1 = 0;
+ int32_t sum_2 = 0;
+
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+ for (int j = 0; j < QK_K/64; ++j) {
+ // load Q4
+ vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+ // load Q8 and multiply it with lower Q4 nibble
+ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+ vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+ vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
+ vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
+
+ sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
+
+ // load Q8 and multiply it with upper Q4 nibble
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+ vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+ vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
+ vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
+
+ sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
+
+ q4 += 32; q8 += 64;
+
+ }
+
+ sumf += d*(sum_1 + sum_2);
+
+ }
+
+ *s = sumf;
+
#else
*s = hsum_float_8(acc) - summs;
+#elif defined __riscv_v_intrinsic
+
+ uint16_t s16[2];
+ const uint8_t * restrict scales = (const uint8_t *)s16;
+
+ float sumf = 0;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q4 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+ s16[0] = b[0] & 0x0f0f;
+ s16[1] = (b[0] >> 4) & 0x0f0f;
+
+ sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
+
+ size_t vl = 32;
+
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+ // load Q4
+ vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+ // load Q8 and multiply it with lower Q4 nibble
+ vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+ vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
+ vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
+
+ sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
+
+ // load Q8 and multiply it with upper Q4 nibble
+ vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+ vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+ vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
+
+ sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
+
+ }
+
+ *s = sumf;
+
#else
uint8_t aux8[QK_K];
*s = hsum_float_8(acc) + summs;
+#elif defined __riscv_v_intrinsic
+
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+ float sumf = 0;
+ float sums = 0.0;
+
+ size_t vl;
+
+ for (int i = 0; i < nb; ++i) {
+
+ vl = 8;
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const uint8_t * restrict hm = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+ const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
+
+ vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+ vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+ vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+ memcpy(utmp, x[i].scales, 12);
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+ const uint32_t uaux = utmp[1] & kmask1;
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+ utmp[2] = uaux;
+ utmp[0] &= kmask1;
+
+ vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+ vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+ vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+ vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+ sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+ vl = 32;
+ int32_t aux32 = 0;
+ int is = 0;
+
+ uint8_t m = 1;
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+ vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
+
+ for (int j = 0; j < QK_K/64; ++j) {
+ // load Q5 and Q8
+ vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
+ vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
+ vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
+
+ // compute mask for addition
+ vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
+ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+ vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+ m <<= 1;
+
+ vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
+ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+ vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+ m <<= 1;
+
+ vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
+ vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
+
+ vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
+ vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
+
+ vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
+ vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
+
+ aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
+ q5 += 32; q8 += 64;
+
+ }
+
+ vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
+ sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
+
+ }
+
+ *s = sumf+sums;
+
#else
const uint8_t * scales = (const uint8_t*)&utmp[0];
*s = hsum_float_8(acc);
+#elif defined __riscv_v_intrinsic
+
+ float sumf = 0;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * (float)x[i].d;
+ const int8_t * sc = x[i].scales;
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ // load qh
+ vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8);
+ vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
+
+ size_t vl = 16;
+
+    // extend and combine both qh_x1 and qh_x2
+ vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
+
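+    // bit 4 is set where the corresponding qh bit is clear; the subtraction below applies the -16 offset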
+ vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
+ vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
+ vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
+ vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
+
+ vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
+ vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
+ vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
+ vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
+
+ // load q5
+ vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl);
+ vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl);
+
+ vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
+ vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
+ vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
+ vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
+
+ vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
+ vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
+ vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
+ vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
+
+ // load Q8 and multiply it with Q5
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+ vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+ vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+ vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+ vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+ int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
+ int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
+ int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
+ int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
+
+ sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
+
+ }
+
+ *s = sumf;
+
#else
int8_t aux8[QK_K];
*s = hsum_float_8(acc);
+#elif defined __riscv_v_intrinsic
+
+ float sumf = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+
+ const uint8_t * restrict q6 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const int8_t * restrict scale = x[i].scales;
+
+ size_t vl;
+
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ int sum_t = 0;
+ int is = 0;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+
+ vl = 32;
+
+ // load qh
+ vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
+
+ // load Q6
+ vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
+ vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
+
+ vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
+ vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
+ vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
+ vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
+
+ vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
+ vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
+ vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
+ vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
+
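+      // combine the 4-bit lows with the 2-bit highs into 6-bit values (still offset by 32)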
+ vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
+ vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
+ vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
+ vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
+
+ vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
+ vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
+ vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
+ vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
+
+ // load Q8 and take product
+ vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
+ vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+ vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+ vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+ vl = 16;
+
+ vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
+ vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
+ vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
+ vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
+ vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
+ vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
+ vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
+ vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
+
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
+ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
+ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
+
+ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+ q6 += 64; qh += 32; q8 += 128; is=8;
+
+ }
+
+ sumf += d * sum_t;
+
+ }
+
+ *s = sumf;
+
#else
int8_t aux8[QK_K];
*s = hsum_float_8(acc);
+#elif defined __riscv_v_intrinsic
+
+ float sumf = 0;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d_all = (float)x[i].d;
+
+ const uint8_t * restrict q6 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const int8_t * restrict scale = x[i].scales;
+
+ int32_t isum = 0;
+
+ size_t vl = 16;
+
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ // load Q6
+ vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
+ vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
+
+ // load qh
+ vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
+
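+    // extract four 2-bit groups from qh and shift each into bits 4..5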
+ vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+ qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+ vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+ qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+ vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+ qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+ vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+
+ vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
+ vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
+ vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
+ vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
+
+ vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
+ vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
+ vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
+ vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
+
+ // load Q8 and take product
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+ vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+ vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+ vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+ vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
+
+ sumf += isum * d_all * y[i].d;
+
+ }
+
+ *s = sumf;
+
#else
int8_t aux8[QK_K];