//
#include <arm_neon.h>
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
#define GGML_FP16_TO_FP32(x) (x)
#define GGML_FP32_TO_FP16(x) (x)
#endif
#ifdef __F16C__
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}
-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#else
    return fp32.as_bits;
}
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;
    return fp32_from_bits(result);
}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
// precomputed exp table for f16 (128 KB)
static ggml_fp16_t table_exp_f16[1 << 16];
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
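// aside (illustration, not part of the change): (1 << 16) = 65536 entries, so the f16 exp
// table above is 65536 * 2 B = 128 KB and the f32 conversion table is 65536 * 4 B = 256 KB.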
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
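// Illustrative sketch (not part of the change): the two ggml.h entry points above just wrap
// whatever GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 resolve to on the current target (identity
// on ARM NEON; elsewhere the lookup table for f16 -> f32 and GGML_COMPUTE_FP32_TO_FP16 for
// the reverse direction). Note that on the lookup path table_f32_f16 is only filled in by
// the first-call initialization shown further down, so these helpers are meant for user
// code rather than early initialization. The function name below is hypothetical.
static void fp16_roundtrip_sketch(void) {
    const float x = 3.14159f;
    const ggml_fp16_t h = ggml_fp32_to_fp16(x); // nearest representable half: 3.140625
    const float y = ggml_fp16_to_fp32(h);       // widened back to f32; the f16 rounding persists
    (void) y;
}
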
//
// timing
//
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 __m128
+#define GGML_F32x4_ZERO _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD _mm_loadu_ps
+#define GGML_F32x4_STORE _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD _mm_add_ps
+#define GGML_F32x4_MUL _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    } \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
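// Illustrative sketch (not part of the change): GGML_F32x4_REDUCE above folds
// GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR = 8 partial-sum registers pairwise
// (8 -> 4 -> 2 -> 1) and finishes with two horizontal adds. Unrolled for eight __m128
// accumulators it behaves like the hypothetical helper below (clobbers its inputs,
// assumes SSE3 for _mm_hadd_ps).
static inline float sse_reduce8_sketch(__m128 acc[8]) {
    acc[0] = _mm_add_ps(acc[0], acc[1]); // 8 -> 4
    acc[2] = _mm_add_ps(acc[2], acc[3]);
    acc[4] = _mm_add_ps(acc[4], acc[5]);
    acc[6] = _mm_add_ps(acc[6], acc[7]);
    acc[0] = _mm_add_ps(acc[0], acc[2]); // 4 -> 2
    acc[4] = _mm_add_ps(acc[4], acc[6]);
    acc[0] = _mm_add_ps(acc[0], acc[4]); // 2 -> 1
    const __m128 t0 = _mm_hadd_ps(acc[0], acc[0]);
    return _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); // sum of all 32 lanes
}
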
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4 __m128
+#define GGML_F32Cx4_ZERO _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD _mm_add_ps
+#define GGML_F32Cx4_MUL _mm_mul_ps
+#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx4
+#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+
#endif
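
// Illustrative sketch (not part of the change): how the GGML_F16_VEC_* layer above is
// consumed by the vector kernels when one of the GGML_SIMD branches is active (compare
// ggml_vec_dot_f16). On the SSE3 path every load and store goes through
// __sse_f16x4_load/__sse_f16x4_store, i.e. each half float is widened to f32 before any
// arithmetic. GGML_F16_ARR (= GGML_F16_STEP/GGML_F16_EPR) is defined just below; the
// function name here is hypothetical.
static inline float f16_dot_sketch(const int n, ggml_fp16_t * x, ggml_fp16_t * y) {
    float sumf = 0.0f;

    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    const int np = (n & ~(GGML_F16_STEP - 1)); // largest multiple of GGML_F16_STEP <= n

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j]  = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j]  = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    GGML_F16_VEC_REDUCE(sumf, sum);

    // scalar tail for the leftover elements
    for (int i = np; i < n; ++i) {
        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
    }

    return sumf;
}
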
// GGML_F32_ARR / GGML_F16_ARR
static bool is_first_call = true;
if (is_first_call) {
- // initialize GELU and EXP tables
+ // initialize GELU, EXP and F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
for (int i = 0; i < (1 << 16); ++i) {
    uint16_t ui = i;
    memcpy(&ii, &ui, sizeof(ii));
-   const float f = GGML_FP16_TO_FP32(ii);
+   const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
    table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
    table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
#endif
}
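
// Illustrative sketch (not part of the change): once the tables are populated above, exp()
// can be approximated by rounding the argument to f16 and using its raw bits as the table
// index; this is roughly the pattern used by kernels such as soft_max. The function name is
// hypothetical and <string.h> is assumed for memcpy.
static inline float fast_exp_f16_sketch(float x) {
    const ggml_fp16_t h = GGML_FP32_TO_FP16(x);    // quantize the argument to half precision
    uint16_t bits;
    memcpy(&bits, &h, sizeof(bits));               // reinterpret the half as a 16-bit table index
    return GGML_FP16_TO_FP32(table_exp_f16[bits]); // precomputed exp(), widened back to f32
}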
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
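// Illustrative sketch (not part of the change): the new flag is queried the same way as the
// existing ggml_cpu_has_* helpers, e.g. when reporting system info from user code
// (assumes <stdio.h>; the function name is hypothetical).
static void print_simd_support_sketch(void) {
    printf("SSE3 = %d | VSX = %d\n", ggml_cpu_has_sse3(), ggml_cpu_has_vsx());
}
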
int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__)
    return 1;