static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
#endif
+#if defined(__loongarch_asx)
+
+#ifdef __clang__
+#define VREGS_PREFIX "$vr"
+#define XREGS_PREFIX "$xr"
+#else // GCC
+#define VREGS_PREFIX "$f"
+#define XREGS_PREFIX "$f"
+#endif
+#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
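+// The conversion helpers below are written as inline asm: GCC/Clang do not expose a
+// constraint that names the same physical register as both an LSX ($vr) and an LASX ($xr)
+// operand, so the .irp/.ifc loops scan all register names and emit the single instruction
+// whose operands match the registers the compiler actually allocated.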
+// Convert __m128i to __m256i
+static inline __m256i ____m256i(__m128i in) {
+ __m256i out = __lasx_xvldi(0);
+ __asm__ volatile (
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " XREGS_PREFIX"\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[in], " VREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ : [out] "+f" (out) : [in] "f" (in)
+ );
+ return out;
+}
+// Convert two __m128i to __m256i
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
+ __m256i out;
+ __asm__ volatile (
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".ifnc %[out], %[hi] \n\t"
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
+ " xvori.b $xr\\i, $xr\\j, 0 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".endif \n\t"
+ : [out] "=f" (out), [hi] "+f" (inhi)
+ : [lo] "f" (inlo)
+ );
+ return out;
+}
+// Convert __m256i low part to __m128i
+static inline __m128i lasx_extracti128_lo(__m256i in) {
+ __m128i out;
+ __asm__ volatile (
+ ".ifnc %[out], %[in] \n\t"
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
+ " vori.b $vr\\i, $vr\\j, 0 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".endif \n\t"
+ : [out] "=f" (out) : [in] "f" (in)
+ );
+ return out;
+}
+// Convert __m256i high part to __m128i
+static inline __m128i lasx_extracti128_hi(__m256i in) {
+ __m128i out;
+ __asm__ volatile (
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ : [out] "=f" (out) : [in] "f" (in)
+ );
+ return out;
+}
+
+static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) {
+ v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7};
+ return (__m256i)__ret;
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+ v4i32 __ret = {d, c, b, a};
+ return (__m128i)__ret;
+}
+
+static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
+ v4i64 __ret = {d, c, b, a};
+ return (__m256i)__ret;
+}
+
+static __m256i lasx_insertf128( __m128i x, __m128i y) {
+ return lasx_set_q(x, y);
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+ __m128i mask_f, zero, tmp0, tmp2, mask;
+ int f = 0x8f;
+ mask_f = __lsx_vreplgr2vr_b(f);
+ zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // keep the low 4 index bits and the sign bit
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // set bit 4 so the index selects from 'a' in vshuf_b
+    mask = __lsx_vsle_b(zero, tmp0); // all-ones where the sign bit of b was clear
+    tmp2 = __lsx_vand_v(tmp0, mask); // clear the index where b was negative, so a zero byte is selected
+ return __lsx_vshuf_b(a, zero, tmp2);
+}
+
+static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
+ __m256i mask_f, zero, tmp0, tmp2, mask;
+ int f = 0x8f;
+ mask_f = __lasx_xvreplgr2vr_b(f);
+ zero = __lasx_xvldi(0);
+    tmp0 = __lasx_xvand_v(b, mask_f); // keep the low 4 index bits and the sign bit
+    tmp0 = __lasx_xvori_b(tmp0, 0x10); // set bit 4 so the index selects from 'a' in xvshuf_b
+    mask = __lasx_xvsle_b(zero, tmp0); // all-ones where the sign bit of b was clear
+    tmp2 = __lasx_xvand_v(tmp0, mask); // clear the index where b was negative, so a zero byte is selected
+ return __lasx_xvshuf_b(a, zero, tmp2);
+}
+
+static __m256i lasx_extu8_16(__m128i a) {
+ __m128i zero = __lsx_vldi(0);
+ __m128i vlo = __lsx_vilvl_b(zero, a);
+ __m128i vhi = __lsx_vilvh_b(zero, a);
+ return lasx_set_q(vhi, vlo);
+}
+
+static __m256i lasx_ext8_16(__m128i a) {
+ __m128i sign = __lsx_vslti_b(a, 0);
+ __m128i vlo = __lsx_vilvl_b(sign, a);
+ __m128i vhi = __lsx_vilvh_b(sign, a);
+ return lasx_set_q(vhi, vlo);
+}
+
+static __m256i lasx_ext16_32(__m128i a) {
+    __m256i tmp1 = __lasx_xvldi(0); // initialize so the element inserts below do not read an undefined vector
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
+ tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
+ return tmp1;
+}
+
+static __m128i lasx_extracti128( __m256i a, int pos) {
+ __m128i ret;
+    if (pos == 0) {
+        ret = lasx_extracti128_lo(a);
+    } else {
+ ret = lasx_extracti128_hi(a);
+ }
+ return ret;
+}
+
+static __m128 lasx_extractf128( __m256 a, int pos) {
+ __m128 ret;
+    if (pos == 0) {
+        ret = (__m128)lasx_extracti128_lo((__m256i)a);
+    } else {
+ ret = (__m128)lasx_extracti128_hi((__m256i)a);
+ }
+ return ret;
+}
+
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
+ __m128i tmp1 = __lsx_vpickev_h(b, a);
+ __m128i tmp2 = __lsx_vpickod_h(b, a);
+ return __lsx_vadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
+ __m128i tmp1 = __lsx_vpickev_w(b, a);
+ __m128i tmp2 = __lsx_vpickod_w(b, a);
+ return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
+ __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
+ __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
+
+ return __lsx_vfadd_s(tmp1, tmp2);
+}
+
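+// Pairwise multiply-add helpers, rough counterparts of the AVX2 maddubs/madd pattern used
+// by the x86 path: lasx_maddubs_h multiplies bytes and saturating-adds adjacent products
+// into int16 (both inputs are treated as signed, so the first operand must fit in int8);
+// lasx_madd_h multiplies int16 elements and adds adjacent products into int32.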
+static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
+ __m256i tmp1, tmp2;
+ tmp1 = __lasx_xvmulwev_h_b(a, b);
+ tmp2 = __lasx_xvmulwod_h_b(a, b);
+ return __lasx_xvsadd_h(tmp1, tmp2);
+}
+
+static __m256i lasx_madd_h(__m256i a, __m256i b) {
+ __m256i tmp1, tmp2;
+ tmp1 = __lasx_xvmulwev_w_h(a, b);
+ tmp2 = __lasx_xvmulwod_w_h(a, b);
+ return __lasx_xvadd_w(tmp1, tmp2);
+}
+
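+// Saturating pack helpers: saturate to the narrower range with (x)vsat, then keep the even
+// (low) halves with (x)vpickev; the LASX variants operate per 128-bit lane, matching the
+// lane behaviour of the x86 pack instructions the other paths rely on.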
+static __m256i lasx_packs_w(__m256i a, __m256i b) {
+ __m256i tmp, tmp1;
+ tmp = __lasx_xvsat_w(a, 15);
+ tmp1 = __lasx_xvsat_w(b, 15);
+ return __lasx_xvpickev_h(tmp1, tmp);
+}
+
+static __m256i lasx_packs_h(__m256i a, __m256i b) {
+ __m256i tmp, tmp1;
+ tmp = __lasx_xvsat_h(a, 7);
+ tmp1 = __lasx_xvsat_h(b, 7);
+ return __lasx_xvpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+ __m128i tmp, tmp1;
+ tmp = __lsx_vsat_w(a, 15);
+ tmp1 = __lsx_vsat_w(b, 15);
+ return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+ __m128i tmp, tmp1;
+ tmp = __lsx_vsat_h(a, 7);
+ tmp1 = __lsx_vsat_h(b, 7);
+ return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+ __m128i tmp, tmp1;
+ tmp = __lsx_vsat_hu(a, 7);
+ tmp1 = __lsx_vsat_hu(b, 7);
+ return __lsx_vpickev_b(tmp1, tmp);
+}
+
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+ __m128i tmp1, tmp2;
+ tmp1 = __lsx_vmulwev_h_b(a, b);
+ tmp2 = __lsx_vmulwod_h_b(a, b);
+ return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+ __m128i tmp1, tmp2;
+ tmp1 = __lsx_vmulwev_w_h(a, b);
+ tmp2 = __lsx_vmulwod_w_h(a, b);
+ return __lsx_vadd_w(tmp1, tmp2);
+}
+
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+ // Get absolute values of x vectors
+ const __m128i ax = __lsx_vsigncov_b(x, x);
+ // Sign the values of the y vectors
+ const __m128i sy = __lsx_vsigncov_b(x, y);
+ // Perform multiplication and create 16-bit values
+ const __m128i dot = lsx_maddubs_h(ax, sy);
+ const __m128i ones = __lsx_vreplgr2vr_h(1);
+ return lsx_madd_h(ones, dot);
+}
+
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+ __m128 res = lasx_extractf128(x, 1);
+ ft_union tmp;
+ res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
+ res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
+ res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
+    tmp.i = __lsx_vpickve2gr_w((__m128i)res, 0);
+ return tmp.f;
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+
+ __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11);
+ __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00);
+
+ __m128i tmp1_128 = lasx_extracti128_lo(tmp1);
+ __m128i tmp2_128 = lasx_extracti128_lo(tmp2);
+
+ __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128);
+
+ __m128i ev = __lsx_vpickev_w(sum128, sum128);
+ __m128i od = __lsx_vpickod_w(sum128, sum128);
+ __m128i sum64 = __lsx_vadd_w(ev, od);
+
+ int sum64_1, sum64_2;
+ sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
+ sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
+
+ return sum64_1 + sum64_2;
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+ __m128i ev = __lsx_vpickev_w(a, a);
+ __m128i od = __lsx_vpickod_w(a, a);
+ __m128i sum64 = __lsx_vadd_w(ev, od);
+
+ int sum64_1, sum64_2;
+ sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
+ sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
+
+ return sum64_1 + sum64_2;
+}
+
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
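+// The 32-bit word is broadcast, the shuffle duplicates each of its bytes 8 times, the OR
+// mask leaves a different single bit as the only possibly-clear bit in each byte, and the
+// compare against all-ones turns that bit into a full 0x00/0xFF byte.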
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+
+ uint32_t x32;
+ memcpy(&x32, x, sizeof(uint32_t));
+ const __m256i shuf_mask = lasx_set_d(
+ 0x0303030303030303, 0x0202020202020202,
+ 0x0101010101010101, 0x0000000000000000);
+
+ __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask);
+ const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe);
+ bytes = __lasx_xvor_v(bytes, bit_mask);
+ return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
+ const __m128i lo = __lsx_vld((const __m128i *)rsi, 0);
+ __m128i hi = __lsx_vsrli_h(lo, 4);
+ return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+ __m256i v = __lasx_xvpackod_h(x, x);
+ __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v);
+ return __lasx_xvffint_s_w(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+ // Perform multiplication and create 16-bit values
+ const __m256i dot = lasx_maddubs_h(ax, sy);
+ return sum_i16_pairs_float(dot);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+
+ // Get absolute values of x vectors
+ const __m256i ax = __lasx_xvsigncov_b(x, x);
+ // Sign the values of the y vectors
+ const __m256i sy = __lasx_xvsigncov_b(x, y);
+
+ return mul_sum_us8_pairs_float(ax, sy);
+}
+
+static inline __m128i packNibbles( __m256i bytes ) {
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+ const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF);
+ __m256i high = __lasx_xvandn_v(lowByte, bytes);
+ __m256i low = __lasx_xvand_v(lowByte, bytes);
+ high = __lasx_xvsrli_h(high, 4);
+ bytes = __lasx_xvor_v(low, high);
+ // Compress uint16_t lanes into bytes
+ __m128i *r0 = (__m128i *)&bytes;
+ __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11);
+ __m128i *r1 = (__m128i *)&tmp_h128;
+
+ __m128i zero = __lsx_vldi(0);
+ __m128i tmp, tmp2, tmp3;
+
+ tmp = __lsx_vmax_h(zero, *r0);
+ tmp2 = __lsx_vsat_hu(tmp, 7);
+
+ tmp = __lsx_vmax_h(zero, *r1);
+ tmp3 = __lsx_vsat_hu(tmp, 7);
+ return __lsx_vpickev_b(tmp3, tmp2);
+}
+#endif //__loongarch_asx
+
// reference implementation for deterministic creation of model files
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
static const int qk = QK4_0;
// store result
__riscv_vse8_v_i8m1(y[i].qs , vs, vl);
}
+
#elif defined(__POWER9_VECTOR__)
for (int i = 0; i < nb; i++) {
vector float srcv [8];
}
vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+
+#elif defined(__loongarch_asx)
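+    // One block per iteration: load 32 floats, compute max|x|, derive d = max/127,
+    // scale and round to the nearest int32, then narrow 4x8 int32 down to 32 int8.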
+ for (int i = 0; i < nb; i++) {
+ ft_union fi;
+ __m256 v0 = (__m256)__lasx_xvld( x , 0);
+ __m256 v1 = (__m256)__lasx_xvld( x , 32);
+ __m256 v2 = (__m256)__lasx_xvld( x , 64);
+ __m256 v3 = (__m256)__lasx_xvld( x , 96);
+ x += 32;
+
+ // Compute max(abs(e)) for the block
+ const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+ __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
+
+ __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) );
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
+ __m128 tmp = max4;
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
+ fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
+ const float max_scalar = fi.f;
+
+ // Quantize these floats
+ const float d = max_scalar / 127.f;
+ y[i].d = GGML_FP32_TO_FP16(d);
+ const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+ const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
+
+ // Apply the multiplier
+ v0 = __lasx_xvfmul_s( v0, mul );
+ v1 = __lasx_xvfmul_s( v1, mul );
+ v2 = __lasx_xvfmul_s( v2, mul );
+ v3 = __lasx_xvfmul_s( v3, mul );
+
+ // Round to nearest integer
+ __m256i i0 = __lasx_xvftintrne_w_s( v0 );
+ __m256i i1 = __lasx_xvftintrne_w_s( v1 );
+ __m256i i2 = __lasx_xvftintrne_w_s( v2 );
+ __m256i i3 = __lasx_xvftintrne_w_s( v3 );
+
+        __m128i ni0 = lasx_extracti128( i0, 0 );
+        __m128i ni1 = lasx_extracti128( i0, 1 );
+        __m128i ni2 = lasx_extracti128( i1, 0 );
+        __m128i ni3 = lasx_extracti128( i1, 1 );
+        __m128i ni4 = lasx_extracti128( i2, 0 );
+        __m128i ni5 = lasx_extracti128( i2, 1 );
+        __m128i ni6 = lasx_extracti128( i3, 0 );
+        __m128i ni7 = lasx_extracti128( i3, 1 );
+
+ // Convert int32 to int16
+ ni0 = lsx_packs_w( ni0, ni1 );
+ ni2 = lsx_packs_w( ni2, ni3 );
+ ni4 = lsx_packs_w( ni4, ni5 );
+ ni6 = lsx_packs_w( ni6, ni7 );
+ // Convert int16 to int8
+ ni0 = lsx_packs_h( ni0, ni2 );
+ ni4 = lsx_packs_h( ni4, ni6 );
+
+ __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
+ __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
+
}
#else
GGML_UNUSED(nb);
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
- const float maxScalar = _mm_cvtss_f32( max4 );
+ const float max_scalar = _mm_cvtss_f32( max4 );
// Quantize these floats
- const float d = maxScalar / 127.f;
+ const float d = max_scalar / 127.f;
y[i].d = GGML_FP32_TO_FP16(d);
- const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+ const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
y[i].s = GGML_FP32_TO_FP16(sum*d);
}
+
#elif defined(__POWER9_VECTOR__)
for (int i = 0; i < nb; i++) {
vector float srcv [8];
accv = vec_add(accv, vec_sld(accv, accv, 4));
accv = vec_add(accv, vec_sld(accv, accv, 8));
y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+
+#elif defined(__loongarch_asx)
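+    // Same scheme as the block above, but the quantized values are also summed so that
+    // d * sum can be stored in y[i].s for the q8_1 dot products.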
+ for (int i = 0; i < nb; i++) {
+ ft_union ft;
+ __m256 v0 = (__m256)__lasx_xvld( x , 0 );
+ __m256 v1 = (__m256)__lasx_xvld( x , 32 );
+ __m256 v2 = (__m256)__lasx_xvld( x , 64 );
+ __m256 v3 = (__m256)__lasx_xvld( x , 96 );
+ x += 32;
+
+ // Compute max(abs(e)) for the block
+ const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+ __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
+
+ __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
+ __m128 tmp = max4;
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
+ ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
+ const float max_scalar = ft.f;
+
+ // Quantize these floats
+ const float d = max_scalar / 127.f;
+ y[i].d = GGML_FP32_TO_FP16(d);
+ const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+ const __m256 mul = __lasx_xvreplfr2vr_s( id );
+
+ // Apply the multiplier
+ v0 = __lasx_xvfmul_s( v0, mul );
+ v1 = __lasx_xvfmul_s( v1, mul );
+ v2 = __lasx_xvfmul_s( v2, mul );
+ v3 = __lasx_xvfmul_s( v3, mul );
+
+ // Round to nearest integer
+ __m256i i0 = __lasx_xvftintrne_w_s( v0 );
+ __m256i i1 = __lasx_xvftintrne_w_s( v1 );
+ __m256i i2 = __lasx_xvftintrne_w_s( v2 );
+ __m256i i3 = __lasx_xvftintrne_w_s( v3 );
+
+        __m128i ni0 = lasx_extracti128( i0, 0 );
+        __m128i ni1 = lasx_extracti128( i0, 1 );
+        __m128i ni2 = lasx_extracti128( i1, 0 );
+        __m128i ni3 = lasx_extracti128( i1, 1 );
+        __m128i ni4 = lasx_extracti128( i2, 0 );
+        __m128i ni5 = lasx_extracti128( i2, 1 );
+        __m128i ni6 = lasx_extracti128( i3, 0 );
+        __m128i ni7 = lasx_extracti128( i3, 1 );
+
+ // Compute the sum of the quants and set y[i].s
+ const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
+ const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
+
+ // Convert int32 to int16
+ ni0 = lsx_packs_w( ni0, ni1 );
+ ni2 = lsx_packs_w( ni2, ni3 );
+ ni4 = lsx_packs_w( ni4, ni5 );
+ ni6 = lsx_packs_w( ni6, ni7 );
+ // Convert int16 to int8
+ ni0 = lsx_packs_h( ni0, ni2 );
+ ni4 = lsx_packs_h( ni4, ni6 );
+
+ __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
+ __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
}
#else
GGML_UNUSED(nb);
};
return _mm_loadu_si128((const __m128i*)k_shuffle + i);
}
+#elif defined(__loongarch_asx)
+// shuffles to pick the required scales in dot products
+static inline __m256i get_scale_shuffle_q3k(int i) {
+ static const uint8_t k_shuffle[128] = {
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+ 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+ };
+ return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
+}
+static inline __m256i get_scale_shuffle_k4(int i) {
+ static const uint8_t k_shuffle[256] = {
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
+ 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+ 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
+ 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
+ };
+ return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
+}
+static inline __m128i get_scale_shuffle(int i) {
+ static const uint8_t k_shuffle[128] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
+ 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
+ 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
+ };
+ return __lsx_vld((const __m128i*)k_shuffle + i, 0);
+}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
}
*s = sumf;
+
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
-#else
- // scalar
- float sumf = 0.0;
- for (int i = 0; i < nb; i++) {
- int sumi = 0;
+#elif defined(__loongarch_asx)
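+    // Per block: unpack 32 4-bit quants to bytes, shift them into [-8, 7], multiply-add
+    // against the q8 block and accumulate d(x)*d(y)*dot across 8 float lanes.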
+ // Initialize accumulator with zeros
+ __m256 acc = (__m256)__lasx_xvldi(0);
- for (int j = 0; j < qk/2; ++j) {
- const int v0 = (x[i].qs[j] & 0x0F) - 8;
- const int v1 = (x[i].qs[j] >> 4) - 8;
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ /* Compute combined scale for the block */
+ const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
- sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
- }
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
- sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
- }
+ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+ const __m256i off = __lasx_xvreplgr2vr_b( 8 );
+ qx = __lasx_xvsub_b( qx, off );
- *s = sumf;
-#endif
-}
+ __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
- const int qk = QK8_1;
- const int nb = n / qk;
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
- assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
- assert((nrc == 2) || (nrc == 1));
-#else
- assert(nrc == 1);
-#endif
- UNUSED(nrc);
- UNUSED(bx);
- UNUSED(by);
- UNUSED(bs);
+ /* Multiply q with scale and accumulate */
+ acc = __lasx_xvfmadd_s( d, q, acc );
+ }
- const block_q4_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ *s = hsum_float_8(acc);
+#elif defined(__loongarch_sx)
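+    // 128-bit LSX fallback: two blocks per iteration with four float accumulators that
+    // are reduced once at the end via hsum_float_4x4.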
+ // set constants
+ const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
+ const __m128i off = __lsx_vreplgr2vr_b(8);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
- if (nrc == 2) {
- const block_q4_1 * restrict vx0 = vx;
- const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
- const block_q8_1 * restrict vy0 = vy;
- const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+ // Initialize accumulator with zeros
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+    __m128 acc_2 = (__m128)__lsx_vldi(0);
+    __m128 acc_3 = (__m128)__lsx_vldi(0);
- float32x4_t sumv0 = vdupq_n_f32(0.0f);
- float32x4_t summs0 = vdupq_n_f32(0.0f);
+ // First round without accumulation
+ {
+        __builtin_prefetch(&x[0] + sizeof(block_q4_0), 0, 3); // read prefetch, high locality (x86 _MM_HINT_T0)
+        __builtin_prefetch(&y[0] + sizeof(block_q8_0), 0, 3);
- for (int i = 0; i < nb; i++) {
- const block_q4_1 * restrict b_x0 = &vx0[i];
+ // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = __lsx_vreplfr2vr_s( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
+
+ const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[0].qs, 0);
+
+ __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
+ __m128i by_0 = __lsx_vld((const __m128i *)y[0].qs, 0);
+ bx_0 = __lsx_vsub_b(bx_0, off);
+ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+ __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
+ __m128i by_1 = __lsx_vld((const __m128i *)(y[0].qs + 16), 0);
+ bx_1 = __lsx_vsub_b(bx_1, off);
+ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+ // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = __lsx_vreplfr2vr_s( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
+
+ const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[1].qs, 0);
+
+ __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
+ __m128i by_2 = __lsx_vld((const __m128i *)y[1].qs, 0);
+ bx_2 = __lsx_vsub_b(bx_2, off);
+ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+ __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
+ __m128i by_3 = __lsx_vld((const __m128i *)(y[1].qs + 16), 0);
+ bx_3 = __lsx_vsub_b(bx_3, off);
+ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+ // Convert int32_t to float
+ __m128 p0 = __lsx_vffint_s_w(i32_0);
+ __m128 p1 = __lsx_vffint_s_w(i32_1);
+ __m128 p2 = __lsx_vffint_s_w(i32_2);
+ __m128 p3 = __lsx_vffint_s_w(i32_3);
+
+ // Apply the scale
+ acc_0 = __lsx_vfmul_s( d_0_1, p0 );
+ acc_1 = __lsx_vfmul_s( d_0_1, p1 );
+ acc_2 = __lsx_vfmul_s( d_2_3, p2 );
+ acc_3 = __lsx_vfmul_s( d_2_3, p3 );
+ }
+
+ assert(nb % 2 == 0); // TODO: handle odd nb
+
+ // Main loop
+ for (int i = 2; i < nb; i+=2) {
+
+ // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = __lsx_vreplfr2vr_s( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+
+ const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[i].qs, 0);
+
+ __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
+ __m128i by_0 = __lsx_vld((const __m128i *)y[i].qs, 0);
+ bx_0 = __lsx_vsub_b(bx_0, off);
+ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+ __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
+ __m128i by_1 = __lsx_vld((const __m128i *)(y[i].qs + 16), 0);
+ bx_1 = __lsx_vsub_b(bx_1, off);
+ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+ //_mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+ //_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+ // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = __lsx_vreplfr2vr_s( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
+
+ const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[i + 1].qs, 0);
+
+ __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
+ __m128i by_2 = __lsx_vld((const __m128i *)y[i + 1].qs, 0);
+ bx_2 = __lsx_vsub_b(bx_2, off);
+ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+ __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
+ __m128i by_3 = __lsx_vld((const __m128i *)(y[i + 1].qs + 16), 0);
+ bx_3 = __lsx_vsub_b(bx_3, off);
+ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+ // Convert int32_t to float
+ __m128 p0 = __lsx_vffint_s_w(i32_0);
+ __m128 p1 = __lsx_vffint_s_w(i32_1);
+ __m128 p2 = __lsx_vffint_s_w(i32_2);
+ __m128 p3 = __lsx_vffint_s_w(i32_3);
+
+ // Apply the scale
+ __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 );
+ __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
+ __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
+ __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
+
+        // Accumulate
+ acc_0 = __lsx_vfadd_s(p0_d, acc_0);
+ acc_1 = __lsx_vfadd_s(p1_d, acc_1);
+ acc_2 = __lsx_vfadd_s(p2_d, acc_2);
+ acc_3 = __lsx_vfadd_s(p3_d, acc_3);
+ }
+
+ *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+
+#else
+ // scalar
+ float sumf = 0.0;
+
+ for (int i = 0; i < nb; i++) {
+ int sumi = 0;
+
+ for (int j = 0; j < qk/2; ++j) {
+ const int v0 = (x[i].qs[j] & 0x0F) - 8;
+ const int v1 = (x[i].qs[j] >> 4) - 8;
+
+ sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
+ }
+
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+ }
+
+ *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ const int qk = QK8_1;
+ const int nb = n / qk;
+
+ assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+#else
+ assert(nrc == 1);
+#endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_q4_1 * restrict x = vx;
+ const block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q4_1 * restrict vx0 = vx;
+ const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+ const block_q8_1 * restrict vy0 = vy;
+ const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+ float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q4_1 * restrict b_x0 = &vx0[i];
const block_q4_1 * restrict b_x1 = &vx1[i];
const block_q8_1 * restrict b_y0 = &vy0[i];
const block_q8_1 * restrict b_y1 = &vy1[i];
}
*s = sumf;
+
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+ // Initialize accumulator with zeros
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ float summs = 0;
+
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ const float d0 = GGML_FP16_TO_FP32(x[i].d);
+ const float d1 = GGML_FP16_TO_FP32(y[i].d);
+
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
+
+ const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
+ const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
+
+ // Compute combined scales
+ const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
+
+ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+ const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+ const __m256i qy = __lasx_xvld( (const __m256i *)y[i].qs, 0);
+
+ const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+ // Accumulate d0*d1*x*y
+ acc = __lasx_xvfmadd_s( d0d1, xy, acc );
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
#else
// scalar
float sumf = 0.0;
}
*s = sumf;
+
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)4);
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+ // Initialize accumulator with zeros
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ // Main loop
+ for (int i = 0; i < nb; i++) {
+ /* Compute combined scale for the block */
+ const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); //FIXME
+
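+        // the andnot/or below folds the fifth bit from qh so each byte ends up holding
+        // (q - 16): where that bit is clear, 0xF0 is OR'ed in, which is -16 as a signed byte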
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
+ __m256i bxhi = bytes_from_bits_32(x[i].qh);
+ bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
+ qx = __lasx_xvor_v(qx, bxhi);
+
+ __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0);
+
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+ /* Multiply q with scale and accumulate */
+ acc = __lasx_xvfmadd_s(d, q, acc);
+ }
+
+ *s = hsum_float_8(acc);
+
#else
// scalar
float sumf = 0.0;
}
*s = sumf;
+
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+ // Initialize accumulator with zeros
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ float summs = 0.0f;
+
+ // Main loop
+ for (int i = 0; i < nb; i++) {
+ const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[i].d));
+
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
+
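+        // here the fifth bit from qh is OR'ed in as 0x10, keeping the quants unsigned in
+        // [0, 31]; the block minimum is accounted for separately through summs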
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
+ __m256i bxhi = bytes_from_bits_32(x[i].qh);
+ bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
+ qx = __lasx_xvor_v(qx, bxhi);
+
+ const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[i].d));
+ const __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0);
+
+ const __m256 q = mul_sum_us8_pairs_float(qx, qy);
+
+ acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc);
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
#else
// scalar
float sumf = 0.0;
}
*s = sumf;
+
#elif defined(__POWER9_VECTOR__)
vector float vsumf0 = vec_splats(0.0f);
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+ // Initialize accumulator with zeros
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ // Compute combined scale for the block
+ const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
+ __m256i qx = __lasx_xvld((const __m256i *)x[i].qs, 0);
+ __m256i qy = __lasx_xvld((const __m256i *)y[i].qs, 0);
+
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+ // Multiply q with scale and accumulate
+ acc = __lasx_xvfmadd_s( d, q, acc );
+ }
+
+ *s = hsum_float_8(acc);
+
#else
// scalar
float sumf = 0.0;
vector signed int vsumi6 = vec_splats((int32_t)0);
vector signed int vsumi7 = vec_splats((int32_t)0);
- const uint8_t * restrict q2 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
for (int j = 0; j < QK_K/128; ++j) {
__builtin_prefetch(q2, 0, 1);
*s = vec_extract(vsumf0, 0);
+#elif defined __loongarch_asx
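+    // q2_K: each byte of x[i].scales packs a 4-bit scale and a 4-bit min; the mins are
+    // folded in through y[i].bsums, and the 2-bit quants are unpacked with shifts and
+    // multiply-added against q8 with the per-16 scales applied via the shuffle tables above.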
+
+ const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+ const __m128i m4 = __lsx_vreplgr2vr_b(0xF);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+ const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
+ const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
+ const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
+ const __m256i mins = lasx_ext8_16(mins8);
+ const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
+
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
+
+ const __m256i all_scales = lasx_ext8_16(scales8);
+ const __m128i l_scales = lasx_extracti128(all_scales, 0);
+ const __m128i h_scales = lasx_extracti128(all_scales, 1);
+ const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+ __m256i sumi = __lasx_xvldi(0);
+
+ for (int j = 0; j < QK_K/128; ++j) {
+
+ const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32;
+
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+ const __m256i q2_0 = __lasx_xvand_v(q2bits, m3);
+ const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3);
+ const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3);
+ const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3);
+
+ __m256i p0 = lasx_maddubs_h(q2_0, q8_0);
+ __m256i p1 = lasx_maddubs_h(q2_1, q8_1);
+ __m256i p2 = lasx_maddubs_h(q2_2, q8_2);
+ __m256i p3 = lasx_maddubs_h(q2_3, q8_3);
+
+ p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0);
+ p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1);
+ p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2);
+ p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3);
+
+ p0 = __lasx_xvadd_w(p0, p1);
+ p2 = __lasx_xvadd_w(p2, p3);
+
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2));
+ }
+
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
#else
float sumf = 0;
*s = sumf;
+
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0x3);
const vector signed char lowScaleMask = vec_splats((signed char)0xF);
*s = vec_extract(vsumf0, 0);
+#elif defined __loongarch_asx
+
+ const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ uint32_t ud, um;
+ const uint8_t * restrict db = (const uint8_t *)&ud;
+ const uint8_t * restrict mb = (const uint8_t *)&um;
+
+ float summs = 0;
+
+ // TODO: optimize this
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+ ud = (sc[0] >> 0) & 0x0f0f0f0f;
+ um = (sc[0] >> 4) & 0x0f0f0f0f;
+
+ int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
+ summs += dmin * smin;
+
+ const __m128i q2bits = __lsx_vld((const __m128i*)q2, 0);
+ const __m256i q2_0 = __lasx_xvand_v(lasx_insertf128(__lsx_vsrli_h(q2bits, 2), q2bits), m3);
+ const __m256i q2_1 = __lasx_xvand_v(lasx_insertf128(__lsx_vsrli_h(q2bits, 6), __lsx_vsrli_h(q2bits, 4)), m3);
+
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0);
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0);
+
+ const __m256i p0 = lasx_maddubs_h(q2_0, q8_0);
+ const __m256i p1 = lasx_maddubs_h(q2_1, q8_1);
+
+ const __m256i p_0 = lasx_ext16_32(lasx_extracti128(p0, 0));
+ const __m256i p_1 = lasx_ext16_32(lasx_extracti128(p0, 1));
+ const __m256i p_2 = lasx_ext16_32(lasx_extracti128(p1, 0));
+ const __m256i p_3 = lasx_ext16_32(lasx_extracti128(p1, 1));
+
+ ft_union t0, t1, t2, t3;
+ t0.f = d * db[0];
+ t1.f = d * db[1];
+ t2.f = d * db[2];
+ t3.f = d * db[3];
+ acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t0.i), __lasx_xvffint_s_w(p_0), acc);
+ acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t1.i), __lasx_xvffint_s_w(p_1), acc);
+ acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t2.i), __lasx_xvffint_s_w(p_2), acc);
+ acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t3.i), __lasx_xvffint_s_w(p_3), acc);
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
#else
float sumf = 0;
vector signed int vsumi6 = vec_splats((int32_t)0);
vector signed int vsumi7 = vec_splats((int32_t)0);
+
const uint8_t * restrict q3 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
-#else
- // scalar version
- // This function is written like this so the compiler can manage to vectorize most of it
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
- // The ideal situation would be if we could just write the code once, and the compiler would
- // automatically produce the best possible set of machine instructions, instead of us having to manually
- // write vectorized versions for AVX, ARM_NEON, etc.
- int8_t aux8[QK_K];
+#elif defined __loongarch_asx
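+    // q3_K: two low bits per quant come from qs and the third bit from hmask; the 6-bit
+    // sub-block scales are reassembled below from the packed 12-byte layout using
+    // kmask1/kmask2.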
+
+ const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+ const __m256i mone = __lasx_xvreplgr2vr_b(1);
+ const __m128i m32 = __lsx_vreplgr2vr_b(32);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ uint32_t aux[3];
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ // Set up scales
+ memcpy(aux, x[i].scales, 12);
+ __m128i scales128 = lsx_set_w(
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+ scales128 = __lsx_vsub_b(scales128, m32);
+ const __m256i all_scales = lasx_ext8_16(scales128);
+ const __m128i l_scales = lasx_extracti128(all_scales, 0);
+ const __m128i h_scales = lasx_extracti128(all_scales, 1);
+ const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+ // high bit
+ const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
+
+ // integer accumulator
+ __m256i sumi = __lasx_xvldi(0);
+
+ int bit = 0;
+ int is = 0;
+
+ const uint8_t * restrict q3 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ // load low 2 bits
+ const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
+
+ // prepare low and high bits
+ const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
+ const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+ ++bit;
+
+ const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
+ const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+ ++bit;
+
+ const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
+ const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+ ++bit;
+
+ const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
+ const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+ ++bit;
+
+ // load Q8 quants
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0);
+ __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
+ __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2);
+ __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3);
+
+ __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
+ __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
+ __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2);
+ __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3);
+
+ p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+ p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+ p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
+ p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
+
+ // multiply with scales
+ p16_0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
+ p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
+ p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
+ p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
+
+ // accumulate
+ p16_0 = __lasx_xvadd_w(p16_0, p16_1);
+ p16_2 = __lasx_xvadd_w(p16_2, p16_3);
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
+ }
+ // multiply with block scale and accumulate
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME
+ }
+
+ *s = hsum_float_8(acc);
+
+#else
+ // scalar version
+ // This function is written like this so the compiler can manage to vectorize most of it
+ // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
+ // manually vectorized version above. Every other version I tried would run at least 4 times slower.
+ // The ideal situation would be if we could just write the code once, and the compiler would
+ // automatically produce the best possible set of machine instructions, instead of us having to manually
+ // write vectorized versions for AVX, ARM_NEON, etc.
+
+ int8_t aux8[QK_K];
int16_t aux16[8];
float sums [8];
int32_t aux32[8];
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
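+    // QK_K == 64 variant: a single 16-byte q3 block per super-block; the two packed scale
+    // bytes are expanded through aux16/aux8 below.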
+
+ const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+ const __m256i m1 = __lasx_xvreplgr2vr_b(1);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ uint64_t aux64;
+
+ uint16_t aux16[2];
+ const int8_t * aux8 = (const int8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        const __m256i scale_0 = lasx_insertf128(__lsx_vreplgr2vr_h(aux8[2] - 8), __lsx_vreplgr2vr_h(aux8[0] - 8));
+        const __m256i scale_1 = lasx_insertf128(__lsx_vreplgr2vr_h(aux8[3] - 8), __lsx_vreplgr2vr_h(aux8[1] - 8));
+
+ memcpy(&aux64, x[i].hmask, 8);
+
+        __m128i haux = __lsx_vreplgr2vr_d(aux64);
+        haux = __lsx_vinsgr2vr_d(haux, aux64 >> 1, 1);
+ __m256i q3h_0 = lasx_insertf128(__lsx_vsrli_h(haux, 2), haux);
+ __m256i q3h_1 = __lasx_xvsrli_h(q3h_0, 4);
+ q3h_0 = __lasx_xvslli_h(__lasx_xvandn_v(q3h_0, m1), 2);
+ q3h_1 = __lasx_xvslli_h(__lasx_xvandn_v(q3h_1, m1), 2);
+
+ // load low 2 bits
+ const __m128i q3bits = __lsx_vld((const __m128i*)q3, 0);
+
+ // prepare low and high bits
+ const __m256i q3aux = lasx_insertf128(__lsx_vsrli_h(q3bits, 2), q3bits);
+ const __m256i q3l_0 = __lasx_xvand_v(q3aux, m3);
+ const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3aux, 4), m3);
+
+ // load Q8 quants
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0);
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0);
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ const __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0);
+ const __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
+
+ __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
+ __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
+
+ p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+ p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+
+ // multiply with scales
+ p16_0 = lasx_madd_h(scale_0, p16_0);
+ p16_1 = lasx_madd_h(scale_1, p16_1);
+
+ p16_0 = __lasx_xvadd_w(p16_0, p16_1);
+
+ // multiply with block scale and accumulate
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(p16_0), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
#else
int8_t aux8[QK_K];
*s = vec_extract(vsumf0, 0);
-#else
+#elif defined __loongarch_asx
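+    // q4_K: eight 6-bit scales and eight 6-bit mins are unpacked from the 12 packed bytes
+    // of x[i].scales; the mins are folded in through y[i].bsums and the 4-bit quants are
+    // multiply-added against q8 in 64-element chunks.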
+
+ const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+ __m128 acc_m = (__m128)__lsx_vldi(0);
+
+ for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+ const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+ const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
+ const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
+ const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
+ acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
+
+ const __m128i sc128 = lasx_extracti128(mins_and_scales, 0);
+ const __m256i scales = lasx_insertf128(sc128, sc128);
+
+ __m256i sumi = __lasx_xvldi(0);
+
+ for (int j = 0; j < QK_K/64; ++j) {
+
+ const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
+ const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
+
+ const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+ const __m256i q4l = __lasx_xvand_v(q4bits, m4);
+ const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4);
+
+ const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ __m256i p16l = lasx_maddubs_h(q4l, q8l);
+ p16l = lasx_madd_h(scale_l, p16l);
+
+ const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ __m256i p16h = lasx_maddubs_h(q4h, q8h);
+ p16h = lasx_madd_h(scale_h, p16h);
+ const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
+
+ sumi = __lasx_xvadd_w(sumi, sumj);
+ }
+
+ __m256 vd = __lasx_xvreplfr2vr_s(d);
+ acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+ }
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
+ __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
+
+ ft_union fi;
+    fi.i = __lsx_vpickve2gr_w((__m128i)acc_m, 0);
+    *s = hsum_float_8(acc) + fi.f;
+
+#else
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
*s = vec_extract(vsumf0, 0);
+#elif defined __loongarch_asx
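+    // QK_K == 64 variant: x[i].d[0] and x[i].d[1] hold the block scale and min as fp16,
+    // and only four 4-bit scale values are used.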
+
+ const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ float summs = 0;
+
+ uint16_t aux16[2];
+ const uint8_t * scales = (const uint8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
+ const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
+ const __m256 vd = __lasx_xvreplfr2vr_s(d);
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+
+ const uint8_t * restrict q4 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0);
+ const __m256i q4l = __lasx_xvand_v(q4bits, m4);
+ const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4);
+
+ const __m256i q8l = __lasx_xvld((const __m256i*)(q8+ 0), 0);
+ const __m256i q8h = __lasx_xvld((const __m256i*)(q8+32), 0);
+
+ const __m256i p16l = lasx_maddubs_h(q4l, q8l);
+ const __m256i p16h = lasx_maddubs_h(q4h, q8h);
+
+ const __m256i p32l = lasx_madd_h(__lasx_xvreplgr2vr_h(scales[0]), p16l);
+ acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(p32l), acc);
+
+ const __m256i p32h = lasx_madd_h(__lasx_xvreplgr2vr_h(scales[1]), p16h);
+ acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(p32h), acc);
+ }
+
+ *s = hsum_float_8(acc) - summs;
+
#else
uint8_t aux8[QK_K];
*s = vec_extract(vsumf0, 0);
+#elif defined __loongarch_asx
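+    // q5_K: same scale/min layout as q4_K, with a fifth bit per quant taken from x[i].qh;
+    // the mins contribution is reduced to a scalar (summs) instead of staying in vector lanes.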
+
+ const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+ const __m128i mzero = __lsx_vldi(0);
+ const __m256i mone = __lasx_xvreplgr2vr_b(1);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ float summs = 0.f;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+#if QK_K == 256
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+#else
+ // TODO
+ const float d = 0, dmin = 0;
+#endif
+
+ const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+ const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
+ const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
+ const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
+ const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero);
+ summs += dmin * __lsx_vpickve2gr_w(hsum, 0); //TODO check
+
+ const __m128i sc128 = lasx_extracti128(mins_and_scales, 0);
+ const __m256i scales = lasx_insertf128(sc128, sc128);
+
+ const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
+ __m256i hmask = mone;
+
+ __m256i sumi = __lasx_xvldi(0);
+
+ int bit = 0;
+
+ for (int j = 0; j < QK_K/64; ++j) {
+
+ const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
+ const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
+
+ const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
+
+ const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
+ const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
+ const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
+ hmask = __lasx_xvslli_h(hmask, 1);
+
+ const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
+ const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
+ const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
+ hmask = __lasx_xvslli_h(hmask, 1);
+
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+ __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0);
+ __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1);
+
+ p16_0 = lasx_madd_h(scale_0, p16_0);
+ p16_1 = lasx_madd_h(scale_1, p16_1);
+
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+ }
+
+ __m256 vd = __lasx_xvreplfr2vr_s(d);
+ acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
#else
const uint8_t * scales = (const uint8_t*)&utmp[0];
*s = vec_extract(vsumf0, 0);
+#elif defined __loongarch_asx
+
+ const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+ const __m256i mone = __lasx_xvreplgr2vr_b(1);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+ const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0);
+
+ const __m256i scale_l = lasx_insertf128(__lsx_vreplgr2vr_h(x[i].scales[1]), __lsx_vreplgr2vr_h(x[i].scales[0]));
+ const __m256i scale_h = lasx_insertf128(__lsx_vreplgr2vr_h(x[i].scales[3]), __lsx_vreplgr2vr_h(x[i].scales[2]));
+
+ int64_t aux64;
+ memcpy(&aux64, x[i].qh, 8);
+        __m128i haux128 = __lsx_vreplgr2vr_d(aux64);
+        haux128 = __lsx_vinsgr2vr_d(haux128, aux64 >> 1, 1);
+ const __m256i haux256 = lasx_insertf128(__lsx_vsrli_h(haux128, 2), haux128);
+
+ const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvandn_v(haux256, mone), 4);
+ const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvandn_v(__lasx_xvsrli_h(haux256, 4), mone), 4);
+
+ const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
+ const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
+
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0);
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0);
+
+ const __m256i p16_0 = lasx_madd_h(scale_l, lasx_maddubs_h(q5l_0, q8_0));
+ const __m256i p16_1 = lasx_madd_h(scale_h, lasx_maddubs_h(q5l_1, q8_1));
+ const __m256i s16_0 = lasx_madd_h(scale_l, lasx_maddubs_h(q5h_0, q8_0));
+ const __m256i s16_1 = lasx_madd_h(scale_h, lasx_maddubs_h(q5h_1, q8_1));
+
+ const __m256i dot = __lasx_xvsub_w(__lasx_xvadd_w(p16_0, p16_1), __lasx_xvadd_w(s16_0, s16_1));
+
+ acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(dot), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
#else
int8_t aux8[QK_K];
*s = vec_extract(vsumf0, 0);
-#else
+#elif defined __loongarch_asx
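+    // q6_K: the low 4 bits come from ql and the upper 2 bits from qh; the constant 32
+    // offset is removed via the separate m32s multiply-add before the scales are applied.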
- int8_t aux8[QK_K];
- int16_t aux16[8];
- float sums [8];
- int32_t aux32[8];
- memset(sums, 0, 8*sizeof(float));
+ const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+ const __m256i m2 = __lasx_xvreplgr2vr_b(3);
+ const __m256i m32s = __lasx_xvreplgr2vr_b(32);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
- float sumf = 0;
for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
const uint8_t * restrict q4 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
- const int8_t * restrict q8 = y[i].qs;
- memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * restrict a = aux8;
- for (int j = 0; j < QK_K; j += 128) {
- for (int l = 0; l < 32; ++l) {
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
- }
- a += 128;
- q4 += 64;
- qh += 32;
- }
- a = aux8;
- int is = 0;
- for (int j = 0; j < QK_K/16; ++j) {
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0);
+
+ __m256i sumi = __lasx_xvldi(0);
+
+ int is = 0;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+
+ const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0));
+ const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1));
+ const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2));
+ const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3));
+ is += 4;
+
+ const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+ const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+ const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
+
+ const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4);
+ const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4);
+ const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4);
+ const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4);
+
+ const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0);
+ const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1);
+ const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2);
+ const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3);
+
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
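+ // q6_K quants carry an implicit -32 offset; compute 32*q8 here so the offset can be
+ // subtracted after the unsigned multiply-add instead of biasing the quants first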
+ __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0);
+ __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1);
+ __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2);
+ __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3);
+
+ __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
+ __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
+ __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2);
+ __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3);
+
+ p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+ p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+ p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
+ p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
+
+ p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
+ p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
+ p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2);
+ p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3);
+
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
+ }
+
+ acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
+#else
+
+ int8_t aux8[QK_K];
+ int16_t aux16[8];
+ float sums [8];
+ int32_t aux32[8];
+ memset(sums, 0, 8*sizeof(float));
+
+ float sumf = 0;
+ for (int i = 0; i < nb; ++i) {
+ const uint8_t * restrict q4 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+ memset(aux32, 0, 8*sizeof(int32_t));
+ int8_t * restrict a = aux8;
+ for (int j = 0; j < QK_K; j += 128) {
+ for (int l = 0; l < 32; ++l) {
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+ }
+ a += 128;
+ q4 += 64;
+ qh += 32;
+ }
+ a = aux8;
+ int is = 0;
+ for (int j = 0; j < QK_K/16; ++j) {
int scale = x[i].scales[is++];
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
*s = vec_extract(vsumf0, 0);
+#elif defined(__loongarch_asx)
+
+ const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+ const __m256i m2 = __lasx_xvreplgr2vr_b(3);
+ const __m256i m32s = __lasx_xvreplgr2vr_b(32);
+
+ __m256 acc = (__m256)__lasx_xvldi(0);
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+ const uint8_t * restrict q4 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ // pack the four 8-bit block scales into two 128-bit vectors:
+ // scale_0 = {scales[0] x8, scales[1] x8}, scale_1 = {scales[2] x8, scales[3] x8}
+ // (__m64 is an x86 MMX type and is not available on LoongArch)
+ const __m128i scale_0 = __lsx_vilvl_d(__lsx_vreplgr2vr_b(x[i].scales[1]), __lsx_vreplgr2vr_b(x[i].scales[0]));
+ const __m128i scale_1 = __lsx_vilvl_d(__lsx_vreplgr2vr_b(x[i].scales[3]), __lsx_vreplgr2vr_b(x[i].scales[2]));
+
+ __m256i sumi = __lasx_xvldi(0);
+
+ const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0);
+ const __m128i q4bitsH = __lsx_vld((const __m128i*)qh, 0);
+
+ const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(lasx_insertf128(__lsx_vsrli_h(q4bitsH, 2), q4bitsH), m2), 4);
+ const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(lasx_insertf128(__lsx_vsrli_h(q4bitsH, 6), __lsx_vsrli_h(q4bitsH, 4)), m2), 4);
+
+ const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0);
+ const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_1);
+
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0);
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0);
+
+ __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0);
+ __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1);
+
+ __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
+ __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
+
+ p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+ p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+
+ p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
+ p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
+
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
#else
int8_t aux8[QK_K];
#endif
-#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__)
+#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
static const int8_t keven_signs_q2xs[1024] = {
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = 0.125f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[4];
+ const uint8_t * aux8 = (const uint8_t *)aux32;
+
+ __m256 accumf = (__m256)__lasx_xvldi(0);
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+ __m256i sumi1 = __lasx_xvldi(0);
+ __m256i sumi2 = __lasx_xvldi(0);
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+
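+ // aux32[0]/aux32[2] hold four 8-bit grid indices each; aux32[1]/aux32[3] hold four
+ // 7-bit sign-pattern indices plus a 4-bit scale in the top nibble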
+ const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+ const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+ const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
+ signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+ const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
+ const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+ const uint16_t ls1 = aux32[1] >> 28;
+ const uint16_t ls2 = aux32[3] >> 28;
+ const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+ const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
+ }
+
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
#else
uint32_t aux32[2];
*s = 0.125f * hsum_float_8(accumf);
#endif
+#elif defined(__loongarch_asx)
+
+ const __m256i mone = __lasx_xvreplgr2vr_b(1);
+ static const char block_sign_shuffle_mask_1[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ };
+ static const char block_sign_shuffle_mask_2[32] = {
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+ };
+ static const uint8_t bit_selector_mask_bytes[32] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0);
+ const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0);
+ const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0);
+
+#if QK_K == 64
+ static const uint8_t k_bit_helper[16] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m128i bit_helper = __lsx_vld((const __m128i*)k_bit_helper, 0);
+ const __m128i m511 = __lsx_vreplgr2vr_h(511);
+ typedef union {
+ __m128i vec_index;
+ uint16_t index[8];
+ } index_t;
+
+ index_t idx;
+ __m256 accumf = (__m256)__lasx_xvldi(0);
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const __m128i q2_data = __lsx_vld((const __m128i*)x[i].qs, 0);
+ idx.vec_index = __lsx_vand_v(q2_data, m511);
+
+ const __m128i partial_sign_bits = __lsx_vsrli_h(q2_data, 9);
+ const __m128i partial_sign_bits_upper = __lsx_vsrli_h(q2_data, 13);
+ const __m128i partial_sign_bits_for_counting = __lsx_vxor_v(partial_sign_bits, partial_sign_bits_upper);
+
+ const __m128i odd_bits = lsx_shuffle_b(bit_helper, partial_sign_bits_for_counting);
+ const __m128i full_sign_bits = __lsx_vor_v(partial_sign_bits, odd_bits);
+ const __m256i full_signs = lasx_insertf128(full_sign_bits, full_sign_bits);
+
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)y[i].qs, 0);
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)(y[i].qs+32), 0);
+
+ const __m256i q2_1 = lasx_set_d(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
+ iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
+ const __m256i q2_2 = lasx_set_d(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
+ iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
+ __m256i signs;
+ signs = lasx_shuffle_b(full_signs, block_sign_shuffle_1);
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1);
+
+ signs = lasx_shuffle_b(full_signs, block_sign_shuffle_2);
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2);
+
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+
+ const __m256i sc1 = lasx_insertf128(__lsx_vreplgr2vr_h(2*(x[i].scales[0] >> 4)+1), __lsx_vreplgr2vr_h(2*(x[i].scales[0] & 0xf)+1));
+ const __m256i sc2 = lasx_insertf128(__lsx_vreplgr2vr_h(2*(x[i].scales[1] >> 4)+1), __lsx_vreplgr2vr_h(2*(x[i].scales[1] & 0xf)+1));
+
+ const __m256i sum = __lasx_xvadd_w(lasx_madd_h(sc1, dot1), lasx_madd_h(sc2, dot2));
+
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sum), accumf);
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+#else
+
+ static const uint8_t k_bit_helper[32] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0);
+ const __m256i m511 = __lasx_xvreplgr2vr_h(511);
+ const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
+ const __m128i m1 = __lsx_vreplgr2vr_b(1);
+
+ uint64_t aux64;
+
+ // somewhat hacky, but gives a significant boost in performance
+ __m256i aux_gindex;
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+ __m256 accumf = (__m256)__lasx_xvldi(0);
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
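+ // unpack the packed 4-bit block scales into bytes and map them to odd values 2*s+1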
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i stmp = __lsx_vreplgr2vr_d(aux64);
+ stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4));
+ const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1);
+
+ __m256i sumi1 = __lasx_xvldi(0);
+ __m256i sumi2 = __lasx_xvldi(0);
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+ const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16;
+ aux_gindex = __lasx_xvand_v(q2_data, m511);
+
+ const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9);
+ const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13);
+ const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper);
+
+ const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting);
+ const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits);
+
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+
+ const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+ iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+ const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+ iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+ const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+ iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+ const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+ iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+
+ const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0);
+ const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1);
+ const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l);
+ const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h);
+
+ __m256i signs;
+ signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1);
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1);
+
+ signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2);
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2);
+
+ signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1);
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3);
+
+ signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2);
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4);
+
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+ const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3);
+ const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4);
+
+ const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0)));
+ const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1)));
+ const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2)));
+ const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3)));
+
+ sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1));
+ sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2));
+ sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3));
+ sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4));
+ }
+
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+#endif
+
#elif defined(__POWER9_VECTOR__)
vector float vsumf0 = vec_splats(0.0f);
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = 0.125f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ };
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+
+ const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
+ const __m128i m1 = __lsx_vreplgr2vr_b(1);
+
+ const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
+ const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
+ uint64_t aux64;
+
+ __m256 accumf = (__m256)__lasx_xvldi(0);
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict qs = x[i].qs;
+ const uint8_t * restrict qh = x[i].qh;
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * restrict q8 = y[i].qs;
+
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i tmp1 = __lsx_vreplgr2vr_d(aux64); // initialize both lanes; avoids reading an uninitialized vector
+ tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1);
+ const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1);
+ const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
+ __m256i sumi1 = __lasx_xvldi(0);
+ __m256i sumi2 = __lasx_xvldi(0);
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+ const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+ qs += 8;
+
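+ // expand the 32 packed sign bits (one per quant) into 0x00/0xFF bytes, then apply
+ // them to q8 as (q8 ^ s) - s (conditional negation)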
+ __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16));
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+ const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+ const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+ aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16));
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+ const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+ const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+ signs += 4;
+
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+
+ const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0)));
+ const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1)));
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
+ }
+
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
#else
float sumf = 0;
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = 0.25f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[2];
+
+ __m256 accumf = (__m256)__lasx_xvldi(0);
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict q3 = x[i].qs;
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
+ const int8_t * restrict q8 = y[i].qs;
+ __m256i sumi1 = __lasx_xvldi(0);
+ __m256i sumi2 = __lasx_xvldi(0);
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ q3 += 8;
+ const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ q3 += 8;
+ memcpy(aux32, gas, 8); gas += 8;
+
+ const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+ signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
+ const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
+ const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+ const uint16_t ls1 = aux32[0] >> 28;
+ const uint16_t ls2 = aux32[1] >> 28;
+
+ const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+ const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
+ }
+
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+ }
+
+ *s = 0.25f * hsum_float_8(accumf);
+
#else
uint32_t aux32;
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ };
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
+ const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
+
+ const __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8);
+ const __m256i idx_mask = __lasx_xvreplgr2vr_w(256);
+
+ typedef union {
+ __m256i vec[2];
+ uint32_t index[16];
+ } index_t;
+
+ index_t idx;
+
+ __m256 accumf = (__m256)__lasx_xvldi(0);
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict qs = x[i].qs;
+ const uint8_t * restrict qh = x[i].qh;
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+ const int8_t * restrict q8 = y[i].qs;
+ __m256i sumi1 = __lasx_xvldi(0);
+ __m256i sumi2 = __lasx_xvldi(0);
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
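+ // build 16 grid indices: the low 8 bits come from qs, the 9th bit from the qh bytes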
+ const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16;
+ idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]);
+ idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
+ idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
+ idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
+ idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
+ idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
+
+ // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+ //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+ const __m256i q2_1 = lasx_set_w(
+ iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+ iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+ );
+ const __m256i q2_2 = lasx_set_w(
+ iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+ iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+ );
+
+ __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16));
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+ const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+ const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+ aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16));
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+ const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+ const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+ signs += 4;
+
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
+ const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+ const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
+ }
+
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+ }
+
+ *s = hsum_float_8(accumf);
+
#else
float sumf = 0.f;
}
-#ifdef __AVX2__
+#if defined(__AVX2__)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i ax = _mm256_sign_epi8(x, x);
const __m256i sy = _mm256_sign_epi8(y, x);
return _mm256_maddubs_epi16(ax, sy);
}
+#elif defined(__loongarch_asx)
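+// LASX equivalent of the AVX2 helper above: form |x| and sign(x)*y, widen the even/odd
+// byte products to 16 bits, add the pairs and saturate (mirrors _mm256_maddubs_epi16).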
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+ const __m256i ax = __lasx_xvsigncov_b(x, x);
+ const __m256i sy = __lasx_xvsigncov_b(x, y);
+ __m256i tmp1, tmp2, tmp3;
+ tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy);
+ tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy);
+ tmp3 = __lasx_xvadd_h(tmp1, tmp2);
+ return __lasx_xvsat_h(tmp3, 15);
+}
#endif
void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ __m256 accum = (__m256)__lasx_xvldi(0);
+ float accum1 = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint16_t * qh = x[i].qh;
+
+ __m256i sumi = __lasx_xvldi(0);
+ int sumi1 = 0;
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ // build the 8 grid values with lasx_set_d to avoid reading uninitialized vectors
+ const __m256i q1b_1 = lasx_set_d(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)],
+ iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
+ iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)],
+ iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+
+ const __m256i q1b_2 = lasx_set_d(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)],
+ iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
+ iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)],
+ iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+
+ qs += 8;
+ const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+ const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+
+ __m256i tmp1, tmp5, tmp6;
+ tmp1 = __lasx_xvreplgr2vr_h(ls1);
+ tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1);
+ tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1);
+ const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6);
+
+ tmp1 = __lasx_xvreplgr2vr_h(ls2);
+ tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1);
+ tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1);
+ const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6);
+
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2));
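+ // accumulate the per-sub-block q8 sums (signed by bit 15 of qh) for the IQ1S_DELTA correction applied at the end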
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+ }
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
+ accum1 += d * sumi1;
+ }
+
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
#else
float sumf = 0;
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+ const __m128i m4b = __lsx_vreplgr2vr_b(0x0f);
+ const __m256i mone = __lasx_xvreplgr2vr_h(1);
+
+ __m256 accum1 = (__m256)__lasx_xvldi(0);
+ __m256 accum2 = (__m256)__lasx_xvldi(0);
+ for (int ib = 0; ib < nb; ib += 2) {
+ const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[0].qs, 0);
+ const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[1].qs, 0);
+ const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[0].qs, 0);
+ const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[1].qs, 0);
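+ // map each 4-bit index to its non-linear value by shuffling the kvalues_iq4nl table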
+ const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)),
+ lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b)));
+ const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)),
+ lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b)));
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+ const __m256i p_1 = lasx_madd_h(p16_1, mone);
+ const __m256i p_2 = lasx_madd_h(p16_2, mone);
+ accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+ __lasx_xvffint_s_w(p_1), accum1);
+ accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+ __lasx_xvffint_s_w(p_2), accum2);
+
+ y += 2;
+ x += 2;
+ }
+
+ *s = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
+
#else
float sumf = 0;
for (int ib = 0; ib < nb; ++ib) {
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
*s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+ const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+ const __m128i m4b = __lsx_vreplgr2vr_b(0x0f);
+
+ __m256 accum = (__m256)__lasx_xvldi(0);
+ __m256i tmp1;
+ __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask;
+
+ mask_8f = __lsx_vreplgr2vr_b(0x8f);
+ for (int ibl = 0; ibl < nb; ++ibl) {
+ const uint8_t * qs = x[ibl].qs;
+ const int8_t * q8 = y[ibl].qs;
+ uint16_t sh = x[ibl].scales_h;
+ __m256i sumi1 = __lasx_xvldi(0);
+ __m256i sumi2 = __lasx_xvldi(0);
+ __m128i zero = __lsx_vldi(0);
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
+ const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
+ const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+ const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
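+ // the tmp0/tmp2/mask sequence below is lsx_shuffle_b() inlined: look up kvalues_iq4nl
+ // for each 4-bit index while zeroing lanes whose shuffle index has the sign bit set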
+ tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f);
+ tmp0 = __lsx_vori_b(tmp2, 0x10);
+ mask = __lsx_vsle_b(zero, tmp2);
+ tmp3 = __lsx_vand_v(tmp0, mask);
+ tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
+
+ tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f);
+ tmp0 = __lsx_vori_b(tmp2, 0x10);
+ mask = __lsx_vsle_b(zero, tmp2);
+ tmp4 = __lsx_vand_v(tmp0, mask);
+ tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
+
+ const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4);
+
+ tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f);
+ tmp0 = __lsx_vori_b(tmp2, 0x10);
+ mask = __lsx_vsle_b(zero, tmp2);
+ tmp3 = __lsx_vand_v(tmp0, mask);
+ tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
+
+ tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f);
+ tmp0 = __lsx_vori_b(tmp2, 0x10);
+ mask = __lsx_vsle_b(zero, tmp2);
+ tmp4 = __lsx_vand_v(tmp0, mask);
+ tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
+
+ const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4);
+
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
+ sh >>= 4;
+ __m256i tmp5, tmp6;
+ tmp1 = __lasx_xvreplgr2vr_h(ls1);
+ tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1);
+ tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1);
+ const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6);
+ tmp1 = __lasx_xvreplgr2vr_h(ls2);
+ tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1);
+ tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1);
+ const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6);
+ sumi1 = __lasx_xvadd_w(p_1, sumi1);
+ sumi2 = __lasx_xvadd_w(p_2, sumi2);
+ }
+ accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+ __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
+ }
+
+ *s = hsum_float_8(accum);
+
#else
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {