for (; ib + 1 < nb; ib += 2) {
// Compute combined scale for the block 0 and 1
- const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+ const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+ const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};
const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
bx_1 = __lsx_vsub_b(bx_1, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
- //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
- //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
// Compute combined scale for the block 2 and 3
- const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+ const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+ const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};
const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
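
Why the scale broadcast changed: __lsx_vreplgr2vr_w takes a 32-bit integer in a general register, so handing it the fp32 product value-converted the scale to an int (typically 0 for these small scales) before the lanes were re-cast to floats, wiping out the combined scale. The new lines build the v4f32 splat from the float itself. Below is a minimal plain-C sketch of the two behaviours; it only assumes GCC/Clang vector extensions, and bits_as_float / splat_via_int / splat_float are illustrative helpers, not ggml code.

#include <stdio.h>
#include <string.h>

typedef float v4f32 __attribute__((vector_size(16)));

/* Reinterpret an int bit pattern as a float, like casting an integer vector to __m128. */
static float bits_as_float(int i) {
    float f;
    memcpy(&f, &i, sizeof f);
    return f;
}

/* Old path: the float scale is value-converted by the intrinsic's int parameter,
 * replicated, and the lanes are then re-read as floats. */
static v4f32 splat_via_int(float d) {
    int   i = (int)d;
    float f = bits_as_float(i);
    return (v4f32){ f, f, f, f };
}

/* New path: splat the float value itself, as in the patched lines above. */
static v4f32 splat_float(float d) {
    return (v4f32){ d, d, d, d };
}

int main(void) {
    float d = 0.0123f;   /* a typical small combined scale */
    printf("old: %g  new: %g\n", splat_via_int(d)[0], splat_float(d)[0]);
    return 0;
}
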
#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
__m256i a;
#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x) \
-{ \
- int offset = GGML_F32_ARR >> 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
- } \
- __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
- tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
- const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
- tmp = __lsx_vsrli_d((__m128i) t0, 32); \
- tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
- res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+ __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+ __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+ __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+ __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+ __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+ res = (ggml_float) ((v4f32)t5)[0]; \
}
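
The new reduce tail replaces the shift/shuffle sequence with two rounds of even/odd pick-and-add on x[0]: the first vpickev_w/vpickod_w pair adds lanes 0+1 and 2+3, the second collapses those partial sums, and lane 0 of the result is read out as the scalar. A hedged portable-C sketch of that tail (hsum_v4f32 is an illustrative name; the lane layout shown assumes vpickev_w/vpickod_w select the even/odd indexed elements when both operands are the same vector):

typedef float v4f32 __attribute__((vector_size(16)));

/* Horizontal sum of four floats, mirroring the vpickev_w/vpickod_w + vfadd_s tail. */
static float hsum_v4f32(v4f32 x) {
    v4f32 ev = { x[0], x[2], x[0], x[2] };  /* even-indexed lanes, like vpickev_w(x, x) */
    v4f32 od = { x[1], x[3], x[1], x[3] };  /* odd-indexed lanes,  like vpickod_w(x, x) */
    v4f32 s  = ev + od;                     /* lane0 = x0+x1, lane1 = x2+x3 */

    v4f32 ev2 = { s[0], s[2], s[0], s[2] };
    v4f32 od2 = { s[1], s[3], s[1], s[3] };
    v4f32 t   = ev2 + od2;                  /* lane0 = x0+x1+x2+x3 */

    return t[0];                            /* what res receives, as ggml_float */
}
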
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
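
On the SET1 fixes: the old GGML_F32x4_SET1 / GGML_F32Cx4_SET1 inserted the value only into lane 0 of a zeroed vector, and did so through an int-typed insert, while every caller expects the value splat across all four lanes; __lsx_vreplfr2vr_s gives the full float splat. A hedged usage sketch in the spirit of ggml's vec_mad loops follows, relying only on the GGML_F32x4_* macros above; vec_mad_f32_lsx is an illustrative name, not an existing ggml function, and it assumes an LSX-enabled build.

/* y[i] += v * x[i], four lanes per step, using the GGML_F32x4_* macros above. */
static void vec_mad_f32_lsx(const int n, float * y, const float * x, const float v) {
    const __m128 vv = GGML_F32x4_SET1(v);   /* correct splat: v in all four lanes */
    int i = 0;
    for (; i + 3 < n; i += 4) {
        __m128 ax = GGML_F32x4_LOAD(x + i);
        __m128 ay = GGML_F32x4_LOAD(y + i);
        ay = GGML_F32x4_FMA(ay, ax, vv);    /* ay += ax * vv */
        GGML_F32x4_STORE(y + i, ay);
    }
    for (; i < n; ++i) {
        y[i] += x[i]*v;                     /* scalar tail */
    }
}
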