//
#include <arm_neon.h>
+typedef __fp16 ggml_fp16_internal_t;
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- __fp16 tmp;
+ ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
- __fp16 tmp = f;
+ ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
+typedef uint16_t ggml_fp16_internal_t;
+
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
- const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
- iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
- const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
- iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
- const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
- iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
- const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
- iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
- #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32