additional optimizations for POWER9 (#454)

author Cameron Kaiser <redacted>

Fri, 24 Mar 2023 15:19:26 +0000 (08:19 -0700)

committer GitHub <redacted>

Fri, 24 Mar 2023 15:19:26 +0000 (17:19 +0200)
author Cameron Kaiser <redacted>
Fri, 24 Mar 2023 15:19:26 +0000 (08:19 -0700)
committer GitHub <redacted>
Fri, 24 Mar 2023 15:19:26 +0000 (17:19 +0200)
diff --git a/Makefile b/Makefile

index 91eebaebd469bf7d6367c11531e2fd6281ee34b4..e8b128cb8c71d6dba7001dc991c020f1bffea874 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -156,7 +156,8 @@ endif
  ifneq ($(filter ppc64%,$(UNAME_M)),)
         POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
         ifneq (,$(findstring POWER9,$(POWER9_M)))
-               CFLAGS += -mpower9-vector
+               CFLAGS += -mcpu=power9
+               CXXFLAGS += -mcpu=power9
         endif
         # Require c++23's std::byteswap for big-endian support.
         ifeq ($(UNAME_M),ppc64)
diff --git a/ggml.c b/ggml.c

index 800390a8821aa96e9bd0ac93d1d097570507af4e..92b857a0007ac53dfa82873d36e017626c5c4176 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -175,6 +175,39 @@ typedef double ggml_float;
  #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
  #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
  
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { /* POWER9 path: convert one half-precision value h to float with inline asm instead of the lookup table */
+    register float f;  /* result; bound to asm operand %1 */
+    register double d; /* scratch floating-point register; bound to asm operand %0 */
+    __asm__(
+        "mtfprd %0,%2\n"   /* copy the general register holding h (%2) into the scratch FP register */
+        "xscvhpdp %0,%0\n" /* convert half precision to double precision, in place in the scratch register */
+        "frsp %1,%0\n" :   /* round the double down to single precision and place it in f */
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { /* POWER9 path: convert one float f to half precision with inline asm instead of the lookup table */
+    register double d;      /* scratch floating-point register; bound to asm operand %0 */
+    register ggml_fp16_t r; /* result; bound to asm operand %1, held in a general register */
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"  /* convert f (%2) to half precision, leaving the result in the scratch FP register */
+        "mffprd %1,%0\n" :  /* copy the scratch FP register into the general register that holds r */
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
  #else
  
  // FP16 <-> FP32
@@ -272,6 +305,7 @@ static float table_f32_f16[1 << 16];
  
  // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
  // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
  #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
  
  inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
@@ -462,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
      assert(k % QK == 0);
  
-#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
+#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
      const int nb = k / QK;
      const size_t bs = sizeof(float) + QK/2;
  
@@ -472,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
      uint8_t pp[QK/2];
  #endif
  
-#if __ARM_NEON
+#if defined(__POWER9_VECTOR__)
+#if QK == 32
+    const vector float v85 = vec_splats(8.5f);
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = *(vector float *)(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
+        //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
+        amaxv[0] = vec_max(amaxv[0], amaxv[2]);
+        amaxv[4] = vec_max(amaxv[4], amaxv[6]);
+        //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
+        amaxv[0] = vec_max(amaxv[0], amaxv[4]);
+
+        amax = MAX(
+                MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
+                MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 3) - 1);
+        const float id = d ? 1.0/d : 0.0;
+
+        *(float *)pd = d;
+        pd += bs;
+
+        const vector float vid = vec_splats(id);
+        for (int l = 0; l < 8; l++) {
+            const vector float vf  = vec_madd(srcv[l], vid, v85);
+            const vector signed int vi = vec_signed(vf);
+
+            pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4);
+            pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
+        }
+
+        //memcpy(pb, pp, sizeof(pp));
+        pb += bs;
+    }
+#else
+#error "not implemented for QK"
+#endif
+#elif __ARM_NEON
  #if QK == 32
      for (int i = 0; i < nb; i++) {
          float amax = 0.0f; // absolute max
author	Cameron Kaiser <redacted>
	Fri, 24 Mar 2023 15:19:26 +0000 (08:19 -0700)
committer	GitHub <redacted>
	Fri, 24 Mar 2023 15:19:26 +0000 (17:19 +0200)