return ((v4f32)res)[0];
}
+
+// multiply int8_t, add results pairwise twice
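+// (each 32-bit lane of the result is the sum of four adjacent int8 products)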
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+ // Get absolute values of x vectors
+ const __m128i ax = __lsx_vsigncov_b(x, x);
+ // Sign the values of the y vectors
+ const __m128i sy = __lsx_vsigncov_b(x, y);
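+ // lsx_maddubs_h multiplies unsigned 8-bit by signed 8-bit, so taking |x|
+ // and moving x's sign onto y keeps each product equal to x*y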
+ // Perform multiplication and create 16-bit values
+ const __m128i dot = lsx_maddubs_h(ax, sy);
+ const __m128i ones = __lsx_vreplgr2vr_h(1);
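+ // madd against a vector of ones pairwise-adds the 16-bit products into
+ // four 32-bit sums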
+ return lsx_madd_h(ones, dot);
+}
#endif
#if defined(__loongarch_asx)
}
}
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
- // Get absolute values of x vectors
- const __m128i ax = __lsx_vsigncov_b(x, x);
- // Sign the values of the y vectors
- const __m128i sy = __lsx_vsigncov_b(x, y);
- // Perform multiplication and create 16-bit values
- const __m128i dot = lsx_maddubs_h(ax, sy);
- const __m128i ones = __lsx_vreplgr2vr_h(1);
- return lsx_madd_h(ones, dot);
-}
-
// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
__m128 res = lasx_extractf128(x, 1);
#define GGML_F32_EPR 4
#define GGML_F32x4 __m128
-#define GGML_F32x4_ZERO __lsx_vldi(0)
-#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
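+// the LSX integer intrinsics return __m128i; cast to __m128 so these macros
+// produce the float vector type the GGML_F32x4 code expects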
+#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
- const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
+ const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
- return __lsx_vld(tmp, 0);
+ return (__m128)__lsx_vld(tmp, 0);
}
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
}
#define GGML_F32Cx4 __m128
-#define GGML_F32Cx4_ZERO __lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
+#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD __lsx_vfadd_s