#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+// STEP: f32 elements handled per unrolled step; EPR: f32 lanes per 512-bit
+// register (512/32 = 16), so GGML_F32_STEP/GGML_F32_EPR accumulators are live
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR 16
+
+#define GGML_F32x16 __m512
+#define GGML_F32x16_ZERO _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD _mm512_loadu_ps
+#define GGML_F32x16_STORE _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+// note the argument order: FMA(a, b, c) computes a + b*c
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD _mm512_add_ps
+#define GGML_F32x16_MUL _mm512_mul_ps
+// pairwise tree-sum the GGML_F32_ARR accumulator registers in x[] down to
+// x[0], then horizontally reduce x[0] to a scalar. the three halving passes
+// cover up to 8 accumulators; with fewer, the trailing passes run zero
+// iterations (offset reaches 0). GGML_F32_ARR is defined later in the file
+// as GGML_F32_STEP/GGML_F32_EPR; expansion is deferred to the use site.
+#define GGML_F32x16_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+// TODO: is this optimal ?
+
+// generic GGML_F32_VEC_* names used by the vec kernels map onto the
+// 16-lane AVX512 implementations above
+#define GGML_F32_VEC GGML_F32x16
+#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// STEP: fp16 elements handled per unrolled step; EPR: values per register
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR 16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+// "C" = converted: 16 fp16 values widened to f32 lanes in one 512-bit register
+#define GGML_F32Cx16 __m512
+#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+// rounding imm 0 = _MM_FROUND_TO_NEAREST_INT (round to nearest even)
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+// note the argument order: FMA(a, b, c) computes a + b*c
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD _mm512_add_ps
+#define GGML_F32Cx16_MUL _mm512_mul_ps
+// pairwise tree-sum the F16-path accumulators into x[0], then horizontally
+// reduce x[0] to a scalar. keyed to GGML_F16_ARR (not GGML_F32_ARR): this
+// reduce runs over arrays sized GGML_F16_STEP/GGML_F16_EPR, so it stays
+// correct even if the F16 step/EPR ever diverge from the F32 ones.
+#define GGML_F32Cx16_REDUCE(res, x) \
+do { \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+// generic GGML_F16_VEC_* names used by the vec kernels; LOAD/STORE take a
+// register index i because some backends need it (unused by LOAD here)
+#define GGML_F16_VEC GGML_F32Cx16
+#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
+
#elif defined(__AVX__)
#define GGML_SIMD