#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
    } \
    res = GGML_F32x4_REDUCE_ONE(x[0]); \
}
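// Aside (not part of the patch): a minimal scalar model of the change made in
// every hunk here. The old macros summed strided pairs in place
// (x[2*i] += x[2*i+1], then x[4*i] += x[4*i+2], then x[8*i] += x[8*i+4]),
// while the new macros halve an `offset` each pass and fold the upper half of
// the accumulator array onto the lower half; either way the total ends up in
// x[0]. Plain floats stand in for the SIMD registers and N = 8 stands in for
// GGML_F32_ARR, so this sketch illustrates the indexing only.
#include <stdio.h>

#define N 8

int main(void) {
    float a[N], b[N];
    for (int i = 0; i < N; ++i) a[i] = b[i] = (float)(i + 1);

    // old scheme: strided in-place pair sums, total accumulates in a[0]
    for (int i = 0; i < N/2; ++i) a[2*i] += a[2*i+1];
    for (int i = 0; i < N/4; ++i) a[4*i] += a[4*i+2];
    for (int i = 0; i < N/8; ++i) a[8*i] += a[8*i+4];

    // new scheme: fold the upper half onto the lower half, halving the
    // offset each pass, total accumulates in b[0]
    for (int offset = N >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            b[i] += b[offset + i];
        }
    }

    printf("old = %g, new = %g\n", a[0], b[0]); // both print 36
    return 0;
}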
#define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
-        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
-        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
-        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
    } \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
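// Aside (not part of the patch): the F16 hunk above ends mid-macro, with the
// surviving float16x8_t accumulator widened into two float32x4_t halves
// (t0, t1). A sketch of how such a reduction is typically finished on
// AArch64: add the two halves, then collapse across lanes. It assumes
// <arm_neon.h> and FP16 vector support; reduce_f16x8 is an illustrative name,
// not a ggml function, and these are not necessarily the exact lines the hunk
// omits.
#include <arm_neon.h>

static inline float reduce_f16x8(float16x8_t v) {
    const float32x4_t lo = vcvt_f32_f16(vget_low_f16 (v)); // lower 4 lanes, widened to f32
    const float32x4_t hi = vcvt_f32_f16(vget_high_f16(v)); // upper 4 lanes, widened to f32
    return vaddvq_f32(vaddq_f32(lo, hi));                  // across-lane sum of the combined halves
}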
#define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                 _mm256_extractf128_ps(x[0], 1)); \
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vec_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vec_add(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vec_add(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
#define GGML_F32x4_MUL wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
#define GGML_F16x4_MUL wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
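// Aside (not part of the patch): how a reduce of this shape is typically
// driven. A dot product keeps several independent SIMD accumulators to hide
// add latency, then folds them exactly as the macros above do: halve the
// offset each pass, add the upper half onto the lower half, and finish with a
// horizontal add of the last register (SSE3 _mm_hadd_ps, as in the final
// hunk). Illustrative only: dot_sse and NACC are stand-ins, not ggml names,
// and n is assumed to be a multiple of 4*NACC for brevity.
#include <immintrin.h>

#define NACC 4 // stand-in for GGML_F32_ARR

static float dot_sse(const float * a, const float * b, int n) {
    __m128 acc[NACC];
    for (int k = 0; k < NACC; ++k) acc[k] = _mm_setzero_ps();

    // main loop: 4*NACC floats per iteration, one accumulator per 4 floats
    for (int i = 0; i < n; i += 4*NACC) {
        for (int k = 0; k < NACC; ++k) {
            const __m128 va = _mm_loadu_ps(a + i + 4*k);
            const __m128 vb = _mm_loadu_ps(b + i + 4*k);
            acc[k] = _mm_add_ps(acc[k], _mm_mul_ps(va, vb));
        }
    }

    // pairwise fold, same indexing as the new GGML_F32x4_REDUCE above
    for (int offset = NACC >> 1; offset > 0; offset >>= 1) {
        for (int k = 0; k < offset; ++k) {
            acc[k] = _mm_add_ps(acc[k], acc[offset + k]);
        }
    }

    // horizontal sum of the remaining register
    const __m128 t0 = _mm_hadd_ps(acc[0], acc[0]);
    return _mm_cvtss_f32(_mm_hadd_ps(t0, t0));
}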