set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
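# start with every optional CPU feature disabled for this variant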
foreach (feat NATIVE
- AVX AVX2 AVX_VNNI FMA F16C
+ AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
ggml_add_cpu_backend_variant(sandybridge AVX)
- ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
- ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
- ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
- ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
+ ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
+ ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
+ ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+ ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
if (NOT MSVC)
# MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+ ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
__m256i sumi = _mm256_setzero_si256();
int sumi1 = 0;
for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
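+ // _pdep_u64 scatters the four low index bytes from qs into the low byte of each
+ // 16-bit lane and the three high bits per index from qh into bits 8-10, yielding
+ // four ready-to-use 11-bit iq1s_grid indices per 64-bit word.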
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
qs += 8;
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
#elif defined __AVX2__
- const __m256i mask = _mm256_set1_epi16(0x7);
+ const __m256i mask = _mm256_set1_epi16(2 * 0x7);
const __m256i mone = _mm256_set1_epi16(1);
+ const __m256i mone8 = _mm256_set1_epi8(1);
__m256 accum1 = _mm256_setzero_ps();
__m256 accum2 = _mm256_setzero_ps();
__m256i sumi1 = _mm256_setzero_si256();
__m256i sumi2 = _mm256_setzero_si256();
for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
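+ // For iq1_m each qh nibble carries 3 index bits plus a sign bit; masking with 0x7777
+ // drops the signs before _pdep_u64 deposits the remaining bits into bits 8-11 of
+ // each 16-bit lane, completing the 11-bit grid index (bit 11 stays zero).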
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+ | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+ | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+ // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
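+ // (pdep places each nibble's sign bit into bit 7 of its own byte of delta_sign)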
+ const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+ const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+ const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
const __m256i q1b_1 = _mm256_set_epi64x(
        iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
        iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
);
const __m256i q1b_2 = _mm256_set_epi64x(
        iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
        iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
);
- const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
- const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
- const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
- const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                         qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                         qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                         qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                         qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                         qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                         qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
- const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
- const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
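+ // _mm256_sign_epi8 negates q8 bytes wherever the corresponding delta byte is negative;
+ // maddubs against an all-ones vector then pair-sums the signed bytes into 16-bit lanes.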
+ const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+ const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
- __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
- __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
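+ // Each 3-bit scale s maps to 2*s + 1; the shifts leave s pre-multiplied by 2, so
+ // masking with 2*0x7 and adding 1 completes the mapping without a separate shift.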
+ __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 2), _mm_set1_epi16(sc[ib/2] << 1));
+ __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 8), _mm_set1_epi16(sc[ib/2] >> 5));
- scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
- scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+ scale1 = _mm256_add_epi16(_mm256_and_si256(scale1, mask), mone);
+ scale2 = _mm256_add_epi16(_mm256_and_si256(scale2, mask), mone);
const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
const __m256i p3 = _mm256_madd_epi16(dot3, scale1);