vec_extract(x[0], 2) + \
vec_extract(x[0], 3); \
}
+#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
+{ \
+ vector float v = vec_add(vec_add(s0, s1), \
+ vec_add(s2, s3)); \
+ v = vec_add(v, vec_sld(v, v, 8)); \
+ v = vec_add(v, vec_sld(v, v, 4)); \
+ res += (ggml_float) vec_extract(v, 0); \
+}
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
r[i - GGML_ENDIAN_BYTE(0)]), \
0, p - GGML_F16_EPR)
+//BF16 POWER9
+#define GGML_BF16_STEP 16
+#define GGML_BF16_EPR 8
+
+#define GGML_BF16x8 vector unsigned short
+#define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
+#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
+
+#define GGML_BF16_VEC GGML_BF16x8
+#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
+#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
+#if defined(__LITTLE_ENDIAN__)
+#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
+#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
+#else
+#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
+#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
+#endif
+#define GGML_BF16_FMA_LO(acc, x, y) \
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
+#define GGML_BF16_FMA_HI(acc, x, y) \
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
+
#elif defined(__wasm_simd128__)
#define GGML_SIMD
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
#endif
+#if defined(__POWER9_VECTOR__)
+ const int np = (n & ~(GGML_BF16_STEP - 1));
+ if (np > 0) {
+ GGML_F32_VEC sum[4] = {GGML_F32_VEC_ZERO};
+ for (; i < np; i += GGML_BF16_STEP) {
+ GGML_BF16_VEC vx0 = GGML_BF16_VEC_LOAD(x + i);
+ GGML_BF16_VEC vx1 = GGML_BF16_VEC_LOAD(x + i + 8);
+ GGML_BF16_VEC vy0 = GGML_BF16_VEC_LOAD(y + i);
+ GGML_BF16_VEC vy1 = GGML_BF16_VEC_LOAD(y + i + 8);
+ GGML_BF16_FMA_LO(sum[0], vx0, vy0);
+ GGML_BF16_FMA_HI(sum[1], vx0, vy0);
+ GGML_BF16_FMA_LO(sum[2], vx1, vy1);
+ GGML_BF16_FMA_HI(sum[3], vx1, vy1);
+ }
+ GGML_F32x4_REDUCE_4(sumf, sum[0], sum[1], sum[2], sum[3]);
+ }
+#endif
+
for (; i < n; ++i) {
sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
GGML_BF16_TO_FP32(y[i]));