ggml : always define ggml_fp16_t as uint16_t (llama/5666)

author Georgi Gerganov <redacted>
Thu, 22 Feb 2024 21:21:39 +0000 (23:21 +0200)
committer Georgi Gerganov <redacted>
Thu, 22 Feb 2024 21:25:33 +0000 (23:25 +0200)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index e7c211d7d6087075eba4aa80d7fb815db867de0e..b0e454e025ec4608de5b77da7f108d11e9232ea2 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
  #include <algorithm>
  #include <assert.h>
  #include <atomic>
@@ -121,11 +125,6 @@
  
  #endif // defined(GGML_USE_HIPBLAS)
  
-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
  #define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
  
  #define CC_PASCAL     600
diff --git a/ggml-impl.h b/ggml-impl.h

index 19df66bceee4a06beb409dd3303bc1ca462fcc11..c5637e4d45d8c5174472ed4c8cd86ab10f5d6dbe 100644 (file)
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -53,11 +53,23 @@ extern "C" {
  //
  #include <arm_neon.h>
  
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
  
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}
  
  #else
  
@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
  // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
  // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
  // This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
+#if !defined(GGML_FP16_TO_FP32)
  inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
      uint16_t s;
      memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  }
  
  #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
  
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
  #endif
  
  #define GGML_HASHTABLE_FULL ((size_t)-1)
diff --git a/ggml-quants.c b/ggml-quants.c

index 8917c8af142558f9cdb9fd6cf142ed9c3e17a726..b15977f53e2f3f279352f8fa97145a36ea266550 100644 (file)
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
          int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
  
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
          q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
          int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
  
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
  
@@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
          aux16[1] = (a[0] >> 4) & 0x0f0f;
  
          const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
  
-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
  
          const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
  
@@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
          const int8_t * sc = x[i].scales;
  
          const uint8_t * restrict q5 = x[i].qs;
@@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
          const int8_t * sc = x[i].scales;
  
          const uint8_t * restrict q5 = x[i].qs;
@@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q6 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q6 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
      float sumf = 0;
  
      for (int ib = 0; ib < nb; ib += 2) {
-
          q4bits.val[0] = vld1q_u8(x[ib+0].qs);
          q4bits.val[1] = vld1q_u8(x[ib+1].qs);
          q8b.val[0]    = vld1q_s8(y[ib+0].qs);
@@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
          prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
          prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
  
-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
-
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
      }
  
      *s = sumf;
diff --git a/ggml.c b/ggml.c

index 5b9fa741a64799b151ad5a37bc960525036aecaf..d710fe702ddbdd31a14bc22c375299d987c241cb 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
  // note: do not use these inside ggml.c
  // these are meant to be used via the ggml.h API
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
+    return GGML_FP16_TO_FP32(x);
  }
  
  ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
      #define GGML_F16x8              float16x8_t
      #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
      #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD         vld1q_f16
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
      #define GGML_F16x8_STORE        vst1q_f16
      #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
      #define GGML_F16x8_ADD          vaddq_f16
@@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
      #define GGML_F32Cx4              float32x4_t
      #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
      #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
      #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
      #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
      #define GGML_F32Cx4_ADD          vaddq_f32
diff --git a/ggml.h b/ggml.h

index bed7a36a0ee6a3e61da0ce160764b9d568f0c178..37eff627928e811a26f4634a9478c3b9341cc93e 100644 (file)
--- a/ggml.h
+++ b/ggml.h
@@ -315,13 +315,7 @@
  extern "C" {
  #endif
  
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
      typedef uint16_t ggml_fp16_t;
-#endif
  
      // convert FP16 <-> FP32
      GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
author	Georgi Gerganov <redacted>
	Thu, 22 Feb 2024 21:21:39 +0000 (23:21 +0200)
committer	Georgi Gerganov <redacted>
	Thu, 22 Feb 2024 21:25:33 +0000 (23:25 +0200)
ggml-cuda.cu		patch | blob | history
ggml-impl.h		patch | blob | history
ggml-quants.c		patch | blob | history
ggml.c		patch | blob | history
ggml.h		patch | blob | history