ggml : move FP16 <-> FP32 code to ggml-impl.h (#3861)

author Georgi Gerganov <redacted>

Mon, 30 Oct 2023 17:19:15 +0000 (19:19 +0200)

committer GitHub <redacted>

Mon, 30 Oct 2023 17:19:15 +0000 (19:19 +0200)
author Georgi Gerganov <redacted>
Mon, 30 Oct 2023 17:19:15 +0000 (19:19 +0200)
committer GitHub <redacted>
Mon, 30 Oct 2023 17:19:15 +0000 (19:19 +0200)
diff --git a/ggml-impl.h b/ggml-impl.h

new file mode 100644 (file)

index 0000000..5ec18a5
--- /dev/null
+++ b/ggml-impl.h
@@ -0,0 +1,237 @@
+#pragma once
+
+#include "ggml.h"
+
+// GGML internal header
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+
+#define GGML_FP16_TO_FP32(x) ((float) (x))
+#define GGML_FP32_TO_FP16(x) (x)
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+    // TODO: backend v2 PR
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml-quants.c b/ggml-quants.c

index fd4ee1be64befa860e141b5c4bd1597165a27c5c..72159446738e3083d6c341d3ade10de3463c096d 100644 (file)
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1,5 +1,5 @@
  #include "ggml-quants.h"
-#include "ggml.h"
+#include "ggml-impl.h"
  
  #include <math.h>
  #include <string.h>
@@ -352,7 +352,7 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
          const float d  = max / -8;
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
  
          for (int j = 0; j < qk/2; ++j) {
              const float x0 = x[i*qk + 0    + j]*id;
@@ -392,8 +392,8 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
          const float d  = (max - min) / ((1 << 4) - 1);
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
-        y[i].m = ggml_fp32_to_fp16(min);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
  
          for (int j = 0; j < qk/2; ++j) {
              const float x0 = (x[i*qk + 0    + j] - min)*id;
@@ -434,7 +434,7 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
          const float d  = max / -16;
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
  
          uint32_t qh = 0;
  
@@ -481,8 +481,8 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
          const float d  = (max - min) / ((1 << 5) - 1);
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
-        y[i].m = ggml_fp32_to_fp16(min);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
  
          uint32_t qh = 0;
  
@@ -524,7 +524,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
          const float d = amax / ((1 << 7) - 1);
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
  
          for (int j = 0; j < QK8_0; ++j) {
              const float x0 = x[i*QK8_0 + j]*id;
@@ -559,7 +559,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
          const float d = amax / ((1 << 7) - 1);
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
  
          for (int j = 0; j < 8; j++) {
              const float32x4_t v  = vmulq_n_f32(srcv[j], id);
@@ -592,7 +592,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
          const float d = amax / ((1 << 7) - 1);
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
  
          for (int j = 0; j < 8; j++) {
              const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
@@ -627,7 +627,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
  
          // Quantize these floats
          const float d = maxScalar / 127.f;
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
          const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
          const __m256 mul = _mm256_set1_ps( id );
  
@@ -704,7 +704,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
          const float d = amax / ((1 << 7) - 1);
          const float id = d ? 1.0f/d : 0.0f;
  
-        y[i].d = ggml_fp32_to_fp16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
  
          vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
  
@@ -982,7 +982,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
      const int nb = k / qk;
  
      for (int i = 0; i < nb; i++) {
-        const float d = ggml_fp16_to_fp32(x[i].d);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
  
          for (int j = 0; j < qk/2; ++j) {
              const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1002,8 +1002,8 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
      const int nb = k / qk;
  
      for (int i = 0; i < nb; i++) {
-        const float d = ggml_fp16_to_fp32(x[i].d);
-        const float m = ggml_fp16_to_fp32(x[i].m);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
  
          for (int j = 0; j < qk/2; ++j) {
              const int x0 = (x[i].qs[j] & 0x0F);
@@ -1023,7 +1023,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
      const int nb = k / qk;
  
      for (int i = 0; i < nb; i++) {
-        const float d = ggml_fp16_to_fp32(x[i].d);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
  
          uint32_t qh;
          memcpy(&qh, x[i].qh, sizeof(qh));
@@ -1049,8 +1049,8 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
      const int nb = k / qk;
  
      for (int i = 0; i < nb; i++) {
-        const float d = ggml_fp16_to_fp32(x[i].d);
-        const float m = ggml_fp16_to_fp32(x[i].m);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
  
          uint32_t qh;
          memcpy(&qh, x[i].qh, sizeof(qh));
@@ -1076,7 +1076,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int
      const int nb = k / qk;
  
      for (int i = 0; i < nb; i++) {
-        const float d = ggml_fp16_to_fp32(x[i].d);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
  
          for (int j = 0; j < qk; ++j) {
              y[i*qk + j] = x[i].qs[j]*d;
@@ -1387,10 +1387,10 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
                  int l = nearest_int(iscale*scales[j]);
                  y[i].scales[j] = l;
              }
-            y[i].d = ggml_fp32_to_fp16(max_scale/q4scale);
+            y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
          } else {
              for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
-            y[i].d = ggml_fp32_to_fp16(0.f);
+            y[i].d = GGML_FP32_TO_FP16(0.f);
          }
          if (max_min > 0) {
              float iscale = q4scale/max_min;
@@ -1398,14 +1398,14 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
                  int l = nearest_int(iscale*mins[j]);
                  y[i].scales[j] |= (l << 4);
              }
-            y[i].dmin = ggml_fp32_to_fp16(max_min/q4scale);
+            y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
          } else {
-            y[i].dmin = ggml_fp32_to_fp16(0.f);
+            y[i].dmin = GGML_FP32_TO_FP16(0.f);
          }
          for (int j = 0; j < QK_K/16; ++j) {
-            const float d = ggml_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
              if (!d) continue;
-            const float dm = ggml_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4);
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
              for (int ii = 0; ii < 16; ++ii) {
                  int l = nearest_int((x[16*j + ii] + dm)/d);
                  l = MAX(0, MIN(3, l));
@@ -1436,8 +1436,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
  
      for (int i = 0; i < nb; i++) {
  
-        const float d = ggml_fp16_to_fp32(x[i].d);
-        const float min = ggml_fp16_to_fp32(x[i].dmin);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float min = GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * q = x[i].qs;
  
@@ -1526,16 +1526,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
                  l >>= 4;
                  y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
              }
-            y[i].d = ggml_fp32_to_fp16(1/iscale);
+            y[i].d = GGML_FP32_TO_FP16(1/iscale);
          } else {
-            y[i].d = ggml_fp32_to_fp16(0.f);
+            y[i].d = GGML_FP32_TO_FP16(0.f);
          }
  
          int8_t sc;
          for (int j = 0; j < QK_K/16; ++j) {
              sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
              sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-            float d = ggml_fp16_to_fp32(y[i].d) * sc;
+            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
              if (!d) {
                  continue;
              }
@@ -1555,16 +1555,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
                  l2 = 8 + MAX(-8, MIN(7, l2));
                  y[i].scales[j/2] = l1 | (l2 << 4);
              }
-            y[i].d = ggml_fp32_to_fp16(1/iscale);
+            y[i].d = GGML_FP32_TO_FP16(1/iscale);
          } else {
              for (int j = 0; j < QK_K/16; j+=2) {
                  y[i].scales[j/2] = 0;
              }
-            y[i].d = ggml_fp32_to_fp16(0.f);
+            y[i].d = GGML_FP32_TO_FP16(0.f);
          }
          for (int j = 0; j < QK_K/16; ++j) {
              int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
-            float d = ggml_fp16_to_fp32(y[i].d) * (s - 8);
+            float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8);
              if (!d) {
                  continue;
              }
@@ -1618,7 +1618,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
  
      for (int i = 0; i < nb; i++) {
  
-        const float d_all = ggml_fp16_to_fp32(x[i].d);
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q = x[i].qs;
          const uint8_t * restrict hm = x[i].hmask;
@@ -1663,7 +1663,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
  
      for (int i = 0; i < nb; i++) {
  
-        const float d_all = ggml_fp16_to_fp32(x[i].d);
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q = x[i].qs;
          const uint8_t * restrict hm = x[i].hmask;
@@ -1753,15 +1753,15 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
                  y[i].scales[j-0] |= ((lm >> 4) << 6);
              }
          }
-        y[i].d = ggml_fp32_to_fp16(max_scale/63.f);
-        y[i].dmin = ggml_fp32_to_fp16(max_min/63.f);
+        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
  
          uint8_t sc, m;
          for (int j = 0; j < QK_K/32; ++j) {
              get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = ggml_fp16_to_fp32(y[i].d) * sc;
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
              if (!d) continue;
-            const float dm = ggml_fp16_to_fp32(y[i].dmin) * m;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
              for (int ii = 0; ii < 32; ++ii) {
                  int l = nearest_int((x[32*j + ii] + dm)/d);
                  l = MAX(0, MIN(15, l));
@@ -1778,17 +1778,17 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
          int m2 = nearest_int(inv_min*mins[1]);
          y[i].scales[0] = d1 | (m1 << 4);
          y[i].scales[1] = d2 | (m2 << 4);
-        y[i].d[0] = ggml_fp32_to_fp16(max_scale/s_factor);
-        y[i].d[1] = ggml_fp32_to_fp16(max_min/s_factor);
+        y[i].d[0] = GGML_FP32_TO_FP16(max_scale/s_factor);
+        y[i].d[1] = GGML_FP32_TO_FP16(max_min/s_factor);
  
          float sumlx = 0;
          int   suml2 = 0;
          for (int j = 0; j < QK_K/32; ++j) {
              const uint8_t sd = y[i].scales[j] & 0xF;
              const uint8_t sm = y[i].scales[j] >>  4;
-            const float d = ggml_fp16_to_fp32(y[i].d[0]) * sd;
+            const float d = GGML_FP16_TO_FP32(y[i].d[0]) * sd;
              if (!d) continue;
-            const float m = ggml_fp16_to_fp32(y[i].d[1]) * sm;
+            const float m = GGML_FP16_TO_FP32(y[i].d[1]) * sm;
              for (int ii = 0; ii < 32; ++ii) {
                  int l = nearest_int((x[32*j + ii] + m)/d);
                  l = MAX(0, MIN(15, l));
@@ -1798,7 +1798,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
              }
          }
          if (suml2) {
-            y[i].d[0] = ggml_fp32_to_fp16(sumlx/suml2);
+            y[i].d[0] = GGML_FP32_TO_FP16(sumlx/suml2);
          }
  #endif
          uint8_t * q = y[i].qs;
@@ -1822,8 +1822,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
  
  #if QK_K == 256
  
-        const float d   = ggml_fp16_to_fp32(x[i].d);
-        const float min = ggml_fp16_to_fp32(x[i].dmin);
+        const float d   = GGML_FP16_TO_FP32(x[i].d);
+        const float min = GGML_FP16_TO_FP32(x[i].dmin);
  
          int is = 0;
          uint8_t sc, m;
@@ -1837,8 +1837,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
              q += 32; is += 2;
          }
  #else
-        const float dall = ggml_fp16_to_fp32(x[i].d[0]);
-        const float mall = ggml_fp16_to_fp32(x[i].d[1]);
+        const float dall = GGML_FP16_TO_FP32(x[i].d[0]);
+        const float mall = GGML_FP16_TO_FP32(x[i].d[1]);
          const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4);
          const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4);
          for (int l = 0; l < 32; ++l) {
@@ -1924,15 +1924,15 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
                  y[i].scales[j-0] |= ((lm >> 4) << 6);
              }
          }
-        y[i].d = ggml_fp32_to_fp16(max_scale/63.f);
-        y[i].dmin = ggml_fp32_to_fp16(max_min/63.f);
+        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
  
          uint8_t sc, m;
          for (int j = 0; j < QK_K/32; ++j) {
              get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = ggml_fp16_to_fp32(y[i].d) * sc;
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
              if (!d) continue;
-            const float dm = ggml_fp16_to_fp32(y[i].dmin) * m;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
              for (int ii = 0; ii < 32; ++ii) {
                  int l = nearest_int((x[32*j + ii] + dm)/d);
                  l = MAX(0, MIN(31, l));
@@ -1976,10 +1976,10 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
              int l = nearest_int(iscale*scales[j]);
              y[i].scales[j] = MAX(-128, MIN(127, l));
          }
-        y[i].d = ggml_fp32_to_fp16(1/iscale);
+        y[i].d = GGML_FP32_TO_FP16(1/iscale);
  
          for (int j = 0; j < QK_K/16; ++j) {
-            const float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j];
+            const float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
              if (!d) continue;
              for (int ii = 0; ii < 16; ++ii) {
                  int l = nearest_int(x[16*j + ii]/d);
@@ -2023,8 +2023,8 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
  
  #if QK_K == 256
  
-        const float d = ggml_fp16_to_fp32(x[i].d);
-        const float min = ggml_fp16_to_fp32(x[i].dmin);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float min = GGML_FP16_TO_FP32(x[i].dmin);
  
          int is = 0;
          uint8_t sc, m;
@@ -2040,7 +2040,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
              u1 <<= 2; u2 <<= 2;
          }
  #else
-        float d = ggml_fp16_to_fp32(x[i].d);
+        float d = GGML_FP16_TO_FP32(x[i].d);
          const int8_t * restrict s = x[i].scales;
          for (int l = 0; l < 8; ++l) {
              y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
@@ -2103,19 +2103,19 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
  
          if (!max_abs_scale) {
              memset(&y[i], 0, sizeof(block_q6_K));
-            y[i].d = ggml_fp32_to_fp16(0.f);
+            y[i].d = GGML_FP32_TO_FP16(0.f);
              x += QK_K;
              continue;
          }
  
          float iscale = -128.f/max_scale;
-        y[i].d = ggml_fp32_to_fp16(1/iscale);
+        y[i].d = GGML_FP32_TO_FP16(1/iscale);
          for (int ib = 0; ib < QK_K/16; ++ib) {
              y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
          }
  
          for (int j = 0; j < QK_K/16; ++j) {
-            float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j];
+            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
              if (!d) {
                  continue;
              }
@@ -2164,7 +2164,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
  
      for (int i = 0; i < nb; i++) {
  
-        const float d = ggml_fp16_to_fp32(x[i].d);
+        const float d = GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict ql = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -2371,8 +2371,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
          const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
          const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
  #else
          const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
          const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2389,8 +2389,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
          const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
          const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
  #endif
      }
  
@@ -2402,7 +2402,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
      // Main loop
      for (int i = 0; i < nb; ++i) {
          /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) );
+        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
  
          __m256i bx = bytes_from_nibbles_32(x[i].qs);
  
@@ -2426,7 +2426,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
      // Main loop
      for (int i = 0; i < nb; ++i) {
          // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) );
+        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
  
          const __m128i lowMask = _mm_set1_epi8(0xF);
          const __m128i off = _mm_set1_epi8(8);
@@ -2468,7 +2468,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
          _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
  
          // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( ggml_fp16_to_fp32(x[0].d) * ggml_fp16_to_fp32(y[0].d) );
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
  
          const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
  
@@ -2486,7 +2486,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
          _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
  
          // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( ggml_fp16_to_fp32(x[1].d) * ggml_fp16_to_fp32(y[1].d) );
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
  
          const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
  
@@ -2521,7 +2521,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
          _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
  
          // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) );
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
  
          const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
  
@@ -2539,7 +2539,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
          _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
  
          // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( ggml_fp16_to_fp32(x[i + 1].d) * ggml_fp16_to_fp32(y[i + 1].d) );
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
  
          const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
  
@@ -2606,7 +2606,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
  
          int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
  
-        sumf += sumi*ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d);
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
      }
  
      *s = sumf;
@@ -2624,7 +2624,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
              sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
          }
  
-        sumf += sumi*ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d);
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
      }
  
      *s = sumf;
@@ -2655,7 +2655,7 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
          const block_q8_1 * restrict y0 = &y[i + 0];
          const block_q8_1 * restrict y1 = &y[i + 1];
  
-        summs += ggml_fp16_to_fp32(x0->m) * y0->s + ggml_fp16_to_fp32(x1->m) * y1->s;
+        summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
  
          const uint8x16_t m4b = vdupq_n_u8(0x0F);
  
@@ -2679,8 +2679,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
          const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
          const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), ggml_fp16_to_fp32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), ggml_fp16_to_fp32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
  #else
          const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
          const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2697,8 +2697,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
          const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
          const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
  #endif
      }
  
@@ -2711,10 +2711,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
  
      // Main loop
      for (int i = 0; i < nb; ++i) {
-        const float d0 = ggml_fp16_to_fp32(x[i].d);
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
          const float d1 = y[i].d;
  
-        summs += ggml_fp16_to_fp32(x[i].m) * y[i].s;
+        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
  
          const __m256 d0v = _mm256_set1_ps( d0 );
          const __m256 d1v = _mm256_set1_ps( d1 );
@@ -2766,7 +2766,7 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
  
          int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
  
-        sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
      }
  
      *s = sumf;
@@ -2784,7 +2784,7 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
              sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
          }
  
-        sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
      }
  
      *s = sumf;
@@ -2864,10 +2864,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
  #if defined(__ARM_FEATURE_DOTPROD)
          sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                          vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d));
+                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
          sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                          vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d));
+                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
  #else
          const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
          const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2884,8 +2884,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
          const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
          const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
  #endif
      }
  
@@ -2946,7 +2946,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
                                             wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                              wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                             wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(ggml_fp16_to_fp32(x0->d) * ggml_fp16_to_fp32(y0->d))));
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
      }
  
      *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2958,7 +2958,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
      // Main loop
      for (int i = 0; i < nb; i++) {
          /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d));
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
  
          __m256i bx = bytes_from_nibbles_32(x[i].qs);
          __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2982,7 +2982,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
      // Main loop
      for (int i = 0; i < nb; i++) {
          /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d));
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
  
          __m256i bx = bytes_from_nibbles_32(x[i].qs);
          const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3066,7 +3066,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
  
          int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
  
-        sumf += (ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)) * sumi;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
      }
  
      *s = sumf;
@@ -3090,7 +3090,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
              sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
          }
  
-        sumf += (ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)) * sumi;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
      }
  
      *s = sumf;
@@ -3130,8 +3130,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  
          const uint8x16_t m4b = vdupq_n_u8(0x0F);
  
-        summs0 += ggml_fp16_to_fp32(x0->m) * y0->s;
-        summs1 += ggml_fp16_to_fp32(x1->m) * y1->s;
+        summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
+        summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
  
          // extract the 5th bit via lookup table ((b) << 4)
          memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -3176,10 +3176,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  #if defined(__ARM_FEATURE_DOTPROD)
          sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                          vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), ggml_fp16_to_fp32(x0->d)*y0->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
          sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                          vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), ggml_fp16_to_fp32(x1->d)*y1->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
  #else
          const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
          const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -3196,8 +3196,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
          const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
          const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
  #endif
      }
  
@@ -3215,7 +3215,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
          const block_q5_1 * restrict x0 = &x[i];
          const block_q8_1 * restrict y0 = &y[i];
  
-        summs += ggml_fp16_to_fp32(x0->m) * y0->s;
+        summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
  
          const v128_t m4b = wasm_i8x16_splat(0x0F);
  
@@ -3262,7 +3262,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
                                             wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                              wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                             wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(ggml_fp16_to_fp32(x0->d) * y0->d)));
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
      }
  
      *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -3275,9 +3275,9 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  
      // Main loop
      for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d));
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
  
-        summs += ggml_fp16_to_fp32(x[i].m) * y[i].s;
+        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
  
          __m256i bx = bytes_from_nibbles_32(x[i].qs);
          __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3302,9 +3302,9 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  
      // Main loop
      for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d));
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
  
-        summs += ggml_fp16_to_fp32(x[i].m) * y[i].s;
+        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
  
          __m256i bx = bytes_from_nibbles_32(x[i].qs);
          const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3385,7 +3385,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  
          int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
  
-        sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
      }
  
      *s = sumf;
@@ -3409,7 +3409,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
              sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
          }
  
-        sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
      }
  
      *s = sumf;
@@ -3451,11 +3451,11 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
  #if defined(__ARM_FEATURE_DOTPROD)
          sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                          vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d));
+                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
  
          sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                          vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d));
+                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
  
  #else
          const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
@@ -3473,8 +3473,8 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
          const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
          const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
  
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
  #endif
      }
  
@@ -3486,7 +3486,7 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
      // Main loop
      for (int i = 0; i < nb; ++i) {
          // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d));
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
          __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
          __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
  
@@ -3517,7 +3517,7 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
  
          int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
  
-        sumf += sumi*(ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d));
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
      }
  
      *s = sumf;
@@ -3532,7 +3532,7 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
              sumi += x[i].qs[j]*y[i].qs[j];
          }
  
-        sumf += sumi*(ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d));
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
      }
  
      *s = sumf;
@@ -3562,8 +3562,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -3641,8 +3641,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -3708,8 +3708,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -3816,8 +3816,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
          const  int8_t * q8 = y[i].qs;
          const uint8_t * sc = x[i].scales;
  
-        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          size_t vl = 16;
  
@@ -3903,8 +3903,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
              summs += y[i].bsums[j] * (sc[j] >> 4);
          }
  
-        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          int isum = 0;
          int is = 0;
@@ -4021,8 +4021,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -4073,8 +4073,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q2 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -4188,8 +4188,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
              summs += y[i].bsums[j] * (sc[j] >> 4);
          }
  
-        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          isum[0] = isum[1] = isum[2] = isum[3] = 0;
          for (int l =  0; l < 16; ++l) {
@@ -4242,7 +4242,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q3 = x[i].qs;
          const uint8_t * restrict qh = x[i].hmask;
@@ -4350,7 +4350,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q3 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -4455,7 +4455,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q3 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -4676,7 +4676,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  
          }
  
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
  
          sumf += d*sum_t;
  
@@ -4741,7 +4741,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
              for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
              q8 += 8; a += 8;
          }
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
          for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
      }
      for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -4843,7 +4843,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q3 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -4914,7 +4914,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q3 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -5099,7 +5099,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
              q8 += 8; a += 8;
              for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l];
          }
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
          for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
      }
      for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -5139,8 +5139,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
  
@@ -5222,8 +5222,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  
     for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          memcpy(utmp, x[i].scales, 12);
          utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -5288,8 +5288,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  
     for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q4 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -5371,8 +5371,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  
          size_t vl = 8;
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
          vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
@@ -5482,9 +5482,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
              for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
              q8 += 8; a += 8;
          }
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
          for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
          sumf -= dmin * sumi;
      }
      for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -5586,8 +5586,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
-        const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
+        const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
          const __m256 vd = _mm256_set1_ps(d);
  
          const uint16_t * a = (const uint16_t *)x[i].scales;
@@ -5632,8 +5632,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
-        const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
+        const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
          const __m256 vd = _mm256_set1_ps(d);
  
          const uint16_t * a = (const uint16_t *)x[i].scales;
@@ -5689,8 +5689,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
          s16[0] = b[0] & 0x0f0f;
          s16[1] = (b[0] >> 4) & 0x0f0f;
  
-        sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
+        sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
  
          size_t vl = 32;
  
@@ -5739,9 +5739,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
          s16[0] = b[0] & 0x0f0f;
          s16[1] = (b[0] >> 4) & 0x0f0f;
  
-        sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+        sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
  
          for (int j = 0; j < QK_K/32; ++j) {
              for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
@@ -5789,8 +5789,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
  
@@ -5878,8 +5878,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
          const int8_t  * restrict q8 = y[i].qs;
  
  #if QK_K == 256
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          memcpy(utmp, x[i].scales, 12);
          utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -5960,8 +5960,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
-        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
  
          const uint8_t * restrict q5 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
@@ -6065,8 +6065,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
          const uint8_t * restrict hm = x[i].qh;
          const  int8_t * restrict q8 = y[i].qs;
  
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
-        const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
  
          vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
          vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
@@ -6188,9 +6188,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
              for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
              q8 += 8; a += 8;
          }
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
          for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
          sumf -= dmin * sumi;
      }
      for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -6288,7 +6288,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
          const uint8_t * restrict q5 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
  
@@ -6334,7 +6334,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
          const uint8_t * restrict q5 = x[i].qs;
          const int8_t  * restrict q8 = y[i].qs;
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
  
@@ -6471,7 +6471,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
              for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
          }
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
          const int8_t * restrict sc = x[i].scales;
  
          for (int j = 0; j < QK_K/16; ++j) {
@@ -6514,7 +6514,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d_all = ggml_fp16_to_fp32(x[i].d);
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q6 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -6646,7 +6646,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q4 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -6726,7 +6726,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q4 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -6838,7 +6838,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
      float sumf = 0;
      for (int i = 0; i < nb; ++i) {
  
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
  
          const uint8_t * restrict q6 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -6955,7 +6955,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
              for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
              q8 += 8; a += 8;
          }
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
          for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
      }
      for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -7053,7 +7053,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q4 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -7110,7 +7110,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  
      for (int i = 0; i < nb; ++i) {
  
-        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
  
          const uint8_t * restrict q4 = x[i].ql;
          const uint8_t * restrict qh = x[i].qh;
@@ -7269,7 +7269,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
              for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
              q8 += 8; a += 8;
          }
-        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
          for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
      }
      for (int l = 0; l < 8; ++l) sumf += sums[l];
diff --git a/ggml-quants.h b/ggml-quants.h

index d88f99e331f1dd2bf9644d0ae8c1e70cff88377f..70c12c27465e8062f824166fb29d1a18eb7bc128 100644 (file)
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -1,22 +1,12 @@
  #pragma once
  
-// This is a private API for quantization and dequantization
-// Should not be used directly, use ggml.h instead
+#include "ggml-impl.h"
  
-#include "ggml.h"
+// GGML internal header
  
  #include <stdint.h>
-#include <assert.h>
  #include <stddef.h>
  
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
  #define QK4_0 32
  typedef struct {
      ggml_fp16_t d;          // delta
diff --git a/ggml.c b/ggml.c

index 95f72c35e8f205a8d10e2c7559017f2417afa226..84407b1224226fed3267876563afa952511e725d 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -1,6 +1,6 @@
  #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
  
-#include "ggml.h"
+#include "ggml-impl.h"
  #include "ggml-quants.h"
  
  #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -27,18 +27,6 @@
  #include <unistd.h>
  #endif
  
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
  // we should just be careful :)
@@ -106,23 +94,11 @@ typedef void * thread_ret_t;
  #include <unistd.h>
  
  #endif
+
  #ifdef GGML_USE_CPU_HBM
  #include <hbwmalloc.h>
  #endif
  
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#endif
-
  /*#define GGML_PERF*/
  #define GGML_DEBUG 0
  #define GGML_GELU_FP16
@@ -248,213 +224,27 @@ inline static void * ggml_aligned_malloc(size_t size) {
  #include "ggml-opencl.h"
  #endif
  
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
  // floating point type used to accumulate sums
  typedef double ggml_float;
  
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
-
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
  //
  // global data
  //
  
  // precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t table_gelu_f16[1 << 16];
+static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
  
  // precomputed quick gelu table for f16 (128 KB)
-static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
  
  // precomputed silu table for f16 (128 KB)
-static ggml_fp16_t table_silu_f16[1 << 16];
+static ggml_fp16_t ggml_table_silu_f16[1 << 16];
  
  // precomputed exp table for f16 (128 KB)
-static ggml_fp16_t table_exp_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB)
-static float table_f32_f16[1 << 16];
+static ggml_fp16_t ggml_table_exp_f16[1 << 16];
  
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-#endif
+// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
+float ggml_table_f32_f16[1 << 16];
  
  // note: do not use these inside ggml.c
  // these are meant to be used via the ggml.h API
@@ -632,6 +422,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
          .vec_dot_type             = GGML_TYPE_Q8_1,
      },
+    [4] = { // GGML_TYPE_Q4_2
+        .type_name                = "DEPRECATED",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+        .to_float                 = NULL,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_COUNT,
+    },
+    [5] = { // GGML_TYPE_Q4_3
+        .type_name                = "DEPRECATED",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+        .to_float                 = NULL,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_COUNT,
+    },
      [GGML_TYPE_Q5_0] = {
          .type_name                = "q5_0",
          .blck_size                = QK5_0,
@@ -1551,7 +1363,7 @@ inline static float ggml_gelu_f32(float x) {
  inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
      const uint16_t * i16 = (const uint16_t *) x;
      for (int i = 0; i < n; ++i) {
-        y[i] = table_gelu_f16[i16[i]];
+        y[i] = ggml_table_gelu_f16[i16[i]];
      }
  }
  
@@ -1561,7 +1373,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
      for (int i = 0; i < n; ++i) {
          ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
          memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
      }
  }
  #else
@@ -1579,7 +1391,7 @@ inline static float ggml_gelu_quick_f32(float x) {
  //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
  //    const uint16_t * i16 = (const uint16_t *) x;
  //    for (int i = 0; i < n; ++i) {
-//        y[i] = table_gelu_quick_f16[i16[i]];
+//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
  //    }
  //}
  
@@ -1589,7 +1401,7 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
      for (int i = 0; i < n; ++i) {
          ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
          memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
      }
  }
  #else
@@ -1608,7 +1420,7 @@ inline static float ggml_silu_f32(float x) {
  //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
  //    const uint16_t * i16 = (const uint16_t *) x;
  //    for (int i = 0; i < n; ++i) {
-//        y[i] = table_silu_f16[i16[i]];
+//        y[i] = ggml_table_silu_f16[i16[i]];
  //    }
  //}
  
@@ -1618,7 +1430,7 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
      for (int i = 0; i < n; ++i) {
          ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
          memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
      }
  }
  #else
@@ -2334,11 +2146,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
              for (int i = 0; i < (1 << 16); ++i) {
                  uint16_t ui = i;
                  memcpy(&ii, &ui, sizeof(ii));
-                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
-                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-                table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-                table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-                table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
+                const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
+                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+                ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
+                ggml_table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
              }
  
              const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -10701,7 +10513,7 @@ static void ggml_compute_forward_soft_max_f32(
                  // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
                  ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
                  memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                  sum += (ggml_float)val;
                  dp[i] = val;
              }
@@ -12990,7 +12802,7 @@ static void ggml_compute_forward_flash_attn_f32(
  #else
                              ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                              memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
  #endif
                              sump[j] += (ggml_float)val;
                              SS[j] = val;
@@ -13192,7 +13004,7 @@ static void ggml_compute_forward_flash_attn_f16(
                          } else {
                              ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                              memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
                              sump[j] += (ggml_float)val;
                              SS[j] = val;
                          }
@@ -13643,7 +13455,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
  #else
                                      ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                                      memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
  #endif
                                      sump[j] += (ggml_float)val;
                                      SW[j] = val;
@@ -14393,7 +14205,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
  #else
                      ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                      memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
  #endif
                      sum += (ggml_float)val;
                      st[i] = val;
@@ -14507,7 +14319,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
  #else
                      ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                      memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
  #endif
                      sum += (ggml_float)val;
                      ds0[i] = val;
diff --git a/llama.cpp b/llama.cpp

index a4340d5277b09c0c63185f5e09ae9044295e07ac..e599917a81eb1d27b4768c67e0b83bcb3296f9ed 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -1467,7 +1467,7 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
  }
  
  static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
-    for (int32_t i = 0; i < cache.size; ++i) {
+    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
          cache.cells[i].pos = -1;
          cache.cells[i].seq_id.clear();
      }
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp

index afd7bf77fcb552472e594d4bce4b72b278a3f8a9..753dae911b0cb3da0f8a22abf3b26fca4aad53c0 100644 (file)
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,7 @@
  
  #undef NDEBUG
  #include <cassert>
-#if !defined(__riscv) && !defined(__s390__)
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
  #include <immintrin.h>
  #endif
  #include <cmath>
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp

index 884af40548fb7912cd2e80c3c7e503bba938c06b..a2459a2867c5c087d27f2a390f96e852b7493207 100644 (file)
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
          ggml_type type = (ggml_type) i;
          ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
  
+        // deprecated - skip
+        if (qfns.blck_size == 0) {
+            continue;
+        }
+
+        printf("Testing %s\n", ggml_type_name((ggml_type) i));
+
          if (qfns.from_float && qfns.to_float) {
              const float total_error = total_quantization_error(qfns, test_size, test_data.data());
              const float max_quantization_error =
author	Georgi Gerganov <redacted>
	Mon, 30 Oct 2023 17:19:15 +0000 (19:19 +0200)
committer	GitHub <redacted>
	Mon, 30 Oct 2023 17:19:15 +0000 (19:19 +0200)
ggml-impl.h	[new file with mode: 0644]	patch \| blob
ggml-quants.c		patch \| blob \| history
ggml-quants.h		patch \| blob \| history
ggml.c		patch \| blob \| history
llama.cpp		patch \| blob \| history
tests/test-double-float.cpp		patch \| blob \| history
tests/test-quantize-fns.cpp		patch \| blob \| history