ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+ ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
if (NOT MSVC)
- # MSVC doesn't support AVX-VNNI or AMX
- ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
+ # MSVC doesn't support AMX
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
else ()
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif()
if (GGML_AVX_VNNI)
- # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
- #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
+ list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
endif()
else ()
if (GGML_NATIVE)
}
static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
const __m256i zero = _mm256_setzero_si256();
return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ return _mm256_dpbusd_avx_epi32(zero, ax, sy);
#else
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
}
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+ return _mm256_cvtepi32_ps(summed_pairs);
#else
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
inline __m256 updot(__m256i u, __m256i s) {
__m256i res;
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
+#elif defined(__AVXVNNI__)
+ res = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), u, s);
#else
res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
#endif