#elif defined(__POWER9_VECTOR__)
-// TODO: uncomment this when it works
-//#define GGML_SIMD
+#define GGML_SIMD
// F32 POWER9
#define GGML_F32_STEP 32
-#define GGML_F32_EPR 8
+#define GGML_F32_EPR 4
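// A 128-bit VSX register holds 4 floats, hence GGML_F32_EPR = 4; with
// GGML_F32_STEP = 32 the generic loops keep GGML_F32_STEP/GGML_F32_EPR = 8
// vector accumulators in flight per step.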
-// TODO: not tested !!
-#define GGML_F32x4 __vector float
-#define GGML_F32x4_ZERO (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
-#define GGML_F32x4_SET1(x) (__vector float){x, x, x, x}
-#define GGML_F32x4_LOAD vec_vsx_ld
-#define GGML_F32x4_STORE vec_vsx_st
+#define GGML_F32x4 vector float
+#define GGML_F32x4_ZERO 0.0f
+#define GGML_F32x4_SET1 vec_splats
+#define GGML_F32x4_LOAD(p) vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
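// Note: vec_xl/vec_xst perform unaligned loads/stores; unlike vec_ld/vec_st
// they do not silently round the address down to a 16-byte boundary.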
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD vec_add
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x)              \
{                                              \
    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \
    }                                          \
    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \
    }                                          \
    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \
    }                                          \
    res = vec_extract(x[0], 0) +               \
          vec_extract(x[0], 1) +               \
          vec_extract(x[0], 2) +               \
          vec_extract(x[0], 3);                \
}
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
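// Illustration (not part of the upstream patch): a minimal sketch of how the
// generic GGML_SIMD pattern is expected to consume the macros above, assuming
// GGML_F32_ARR is defined as GGML_F32_STEP/GGML_F32_EPR as on the other
// architectures; the function and local names are hypothetical.
static inline void ggml_sketch_vec_dot_f32(const int n, float * s, const float * x, const float * y) {
    float sumf = 0.0f;
    const int np = (n & ~(GGML_F32_STEP - 1)); // largest multiple of GGML_F32_STEP <= n

    GGML_F32x4 sum[GGML_F32_ARR] = { GGML_F32x4_ZERO }; // independent accumulators
    GGML_F32x4 ax[GGML_F32_ARR];
    GGML_F32x4 ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j]  = GGML_F32x4_LOAD(x + i + j*GGML_F32_EPR);
            ay[j]  = GGML_F32x4_LOAD(y + i + j*GGML_F32_EPR);
            // note the argument order: GGML_F32x4_FMA(acc, a, b) = a*b + acc
            sum[j] = GGML_F32x4_FMA(sum[j], ax[j], ay[j]);
        }
    }

    GGML_F32x4_REDUCE(sumf, sum); // fold the vectors, then the 4 lanes, into sumf

    for (int i = np; i < n; ++i) { // scalar leftovers
        sumf += x[i]*y[i];
    }

    *s = sumf;
}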
// F16 POWER9
-// TODO: implement here
-// ...
+#define GGML_F16_STEP GGML_F32_STEP
+#define GGML_F16_EPR GGML_F32_EPR
+#define GGML_F16_VEC GGML_F32x4
+#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA GGML_F32x4_FMA
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
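// f16 values are widened to f32 on load, so the arithmetic macros simply
// alias their F32 counterparts; only the LOAD/STORE macros below have to
// convert between fp16 and fp32.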
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
+ vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+ vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_F16_VEC_STORE(p, r, i) \
+ if (i & 0x1) \
+ vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)
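// Illustration (not part of the upstream patch): VSX loads 16 bytes at a
// time, i.e. 8 halfs, while GGML_F16_EPR is only 4. The macros above
// therefore treat even/odd indices as a pair over one 16-byte block: the even
// LOAD takes the low four halfs, the odd LOAD re-reads the same block
// (p - GGML_F16_EPR) and takes the high four, and only the odd-index STORE
// writes, repacking both float vectors at once. A hypothetical hand-expansion
// of one such pair:
static inline void ggml_sketch_f16_pair(ggml_fp16_t * p, float v) {
    vector unsigned short h = vec_xl(0, p);            // one unaligned load of 8 fp16 values
    vector float lo = vec_extract_fp32_from_shortl(h); // what GGML_F16_VEC_LOAD yields for even i
    vector float hi = vec_extract_fp32_from_shorth(h); // what it yields for odd i

    lo = vec_madd(lo, vec_splats(v), lo);              // some computation on both halves
    hi = vec_madd(hi, vec_splats(v), hi);

    // the odd-index GGML_F16_VEC_STORE: pack 2x4 floats back into 8 halfs
    vec_xst(vec_pack_to_short_fp32(hi, lo), 0, p);
}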
#elif defined(__wasm_simd128__)
    for (int i = np; i < n; ++i) {
        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
    }
-#elif defined(__POWER9_VECTOR__)
- // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
- // being able to test it. hoping someone with access to a POWER9 machine can help out here.
- const int n32 = (n & ~31);
-
- vector float sum0 = vec_splats (0.0f);
- vector float sum1 = vec_splats (0.0f);
- vector float sum2 = vec_splats (0.0f);
- vector float sum3 = vec_splats (0.0f);
- vector float sum4 = vec_splats (0.0f);
- vector float sum5 = vec_splats (0.0f);
- vector float sum6 = vec_splats (0.0f);
- vector float sum7 = vec_splats (0.0f);
-
- for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
- // Use vec_xl, not vec_ld, because x is sometimes unaligned.
- vector unsigned short x0 = vec_xl(j + 0, x);
- vector unsigned short x1 = vec_xl(j + 16, x);
- vector unsigned short x2 = vec_xl(j + 32, x);
- vector unsigned short x3 = vec_xl(j + 48, x);
-
- vector unsigned short y0 = vec_ld(j + 0, y);
- vector unsigned short y1 = vec_ld(j + 16, y);
- vector unsigned short y2 = vec_ld(j + 32, y);
- vector unsigned short y3 = vec_ld(j + 48, y);
-
- vector float fx0l = vec_extract_fp32_from_shortl(x0);
- vector float fx0h = vec_extract_fp32_from_shorth(x0);
- vector float fx1l = vec_extract_fp32_from_shortl(x1);
- vector float fx1h = vec_extract_fp32_from_shorth(x1);
- vector float fx2l = vec_extract_fp32_from_shortl(x2);
- vector float fx2h = vec_extract_fp32_from_shorth(x2);
- vector float fx3l = vec_extract_fp32_from_shortl(x3);
- vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
- vector float fy0l = vec_extract_fp32_from_shortl(y0);
- vector float fy0h = vec_extract_fp32_from_shorth(y0);
- vector float fy1l = vec_extract_fp32_from_shortl(y1);
- vector float fy1h = vec_extract_fp32_from_shorth(y1);
- vector float fy2l = vec_extract_fp32_from_shortl(y2);
- vector float fy2h = vec_extract_fp32_from_shorth(y2);
- vector float fy3l = vec_extract_fp32_from_shortl(y3);
- vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
- sum0 = vec_madd(fx0l, fy0l, sum0);
- sum1 = vec_madd(fx0h, fy0h, sum1);
- sum2 = vec_madd(fx1l, fy1l, sum2);
- sum3 = vec_madd(fx1h, fy1h, sum3);
- sum4 = vec_madd(fx2l, fy2l, sum4);
- sum5 = vec_madd(fx2h, fy2h, sum5);
- sum6 = vec_madd(fx3l, fy3l, sum6);
- sum7 = vec_madd(fx3h, fy3h, sum7);
- }
-
- sum0 = vec_add(sum0, sum1);
- sum2 = vec_add(sum2, sum3);
- sum4 = vec_add(sum4, sum5);
- sum6 = vec_add(sum6, sum7);
-
- sum0 = vec_add(sum0, sum2);
- sum4 = vec_add(sum4, sum6);
-
- sum0 = vec_add(sum0, sum4);
-
- sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
- + vec_extract(sum0, 2) + vec_extract(sum0, 3);
-
- for (int i = n32; i < n; ++i) {
- sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
- }
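// (removed above) With GGML_SIMD now enabled for POWER9, this hand-unrolled
// dot product is covered by the generic f16 path; roughly, assuming the same
// loop shape as the other architectures (np and GGML_F16_ARR as defined by
// the generic code):
//
//     GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
//     GGML_F16_VEC ax[GGML_F16_ARR];
//     GGML_F16_VEC ay[GGML_F16_ARR];
//
//     for (int i = 0; i < np; i += GGML_F16_STEP) {
//         for (int j = 0; j < GGML_F16_ARR; j++) {
//             ax[j]  = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
//             ay[j]  = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
//             sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
//         }
//     }
//
//     GGML_F16_VEC_REDUCE(sumf, sum);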
#else
    for (int i = 0; i < n; ++i) {
        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
    }

    for (int i = np; i < n; ++i) {
        GGML_ASSERT(false);
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
    }
-#elif defined(__POWER9_VECTOR__)
- // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
- // being able to test it. hoping someone with access to a POWER9 machine can help out here.
- const int n32 = (n & ~31);
- for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
- // Use vec_xl, not vec_ld, because x is sometimes unaligned!
- vector unsigned short x0 = vec_xl(j + 0, x);
- vector unsigned short x1 = vec_xl(j + 16, x);
- vector unsigned short x2 = vec_xl(j + 32, x);
- vector unsigned short x3 = vec_xl(j + 48, x);
-
- vector unsigned short y0 = vec_xl(j + 0, y);
- vector unsigned short y1 = vec_xl(j + 16, y);
- vector unsigned short y2 = vec_xl(j + 32, y);
- vector unsigned short y3 = vec_xl(j + 48, y);
-
- vector float v4 = vec_splats(v);
-
- vector float fx0l = vec_extract_fp32_from_shortl(x0);
- vector float fx0h = vec_extract_fp32_from_shorth(x0);
- vector float fx1l = vec_extract_fp32_from_shortl(x1);
- vector float fx1h = vec_extract_fp32_from_shorth(x1);
- vector float fx2l = vec_extract_fp32_from_shortl(x2);
- vector float fx2h = vec_extract_fp32_from_shorth(x2);
- vector float fx3l = vec_extract_fp32_from_shortl(x3);
- vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
- vector float fy0l = vec_extract_fp32_from_shortl(y0);
- vector float fy0h = vec_extract_fp32_from_shorth(y0);
- vector float fy1l = vec_extract_fp32_from_shortl(y1);
- vector float fy1h = vec_extract_fp32_from_shorth(y1);
- vector float fy2l = vec_extract_fp32_from_shortl(y2);
- vector float fy2h = vec_extract_fp32_from_shorth(y2);
- vector float fy3l = vec_extract_fp32_from_shortl(y3);
- vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
- fy0l = vec_madd(fx0l, v4, fy0l);
- fy0h = vec_madd(fx0h, v4, fy0h);
- fy1l = vec_madd(fx1l, v4, fy1l);
- fy1h = vec_madd(fx1h, v4, fy1h);
- fy2l = vec_madd(fx2l, v4, fy2l);
- fy2h = vec_madd(fx2h, v4, fy2h);
- fy3l = vec_madd(fx3l, v4, fy3l);
- fy3h = vec_madd(fx3h, v4, fy3h);
-
- y0 = vec_pack_to_short_fp32(fy0h, fy0l);
- y1 = vec_pack_to_short_fp32(fy1h, fy1l);
- y2 = vec_pack_to_short_fp32(fy2h, fy2l);
- y3 = vec_pack_to_short_fp32(fy3h, fy3l);
-
- vec_xst(y0, j + 0, y);
- vec_xst(y1, j + 16, y);
- vec_xst(y2, j + 32, y);
- vec_xst(y3, j + 48, y);
- }
-
- for (int i = n32; i < n; ++i) {
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
- }
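// (removed above) Likewise, the temporary y += x*v loop now maps onto the
// generic GGML_SIMD f16 path; roughly, under the same assumptions:
//
//     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
//
//     GGML_F16_VEC ax[GGML_F16_ARR];
//     GGML_F16_VEC ay[GGML_F16_ARR];
//
//     for (int i = 0; i < np; i += GGML_F16_STEP) {
//         for (int j = 0; j < GGML_F16_ARR; j++) {
//             ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
//             ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
//             ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
//
//             GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
//         }
//     }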
#else
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);