}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+ // Initialize accumulator with zeros
+ __m256 acc = _mm256_setzero_ps();
+
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ // Compute combined scale for the block
+ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+ __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+ __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+ const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+ // Multiply q with scale and accumulate
+ acc = _mm256_fmadd_ps( d, q, acc );
+ }
+
+ *s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;