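ggml : tiny ggml_vec_dot_q4_K_q8_K AVX2 improvement (#2819)

In the AVX2 inner loop, the low- and high-nibble products (p16l, p16h) are now added together first and the result is accumulated into sumi once per iteration, instead of accumulating into sumi twice. The computed sum is unchanged; only the chain of dependent additions on sumi is shortened.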

author Ronny Brendel <redacted>

Mon, 28 Aug 2023 12:51:08 +0000 (14:51 +0200)

committer GitHub <redacted>

Mon, 28 Aug 2023 12:51:08 +0000 (15:51 +0300)
author Ronny Brendel <redacted>
Mon, 28 Aug 2023 12:51:08 +0000 (14:51 +0200)
committer GitHub <redacted>
Mon, 28 Aug 2023 12:51:08 +0000 (15:51 +0300)
diff --git a/k_quants.c b/k_quants.c

index 82bf816976c00c8a365bd4252d1223102366c94d..3a9b1dafdb34540fec35aa47b8a7acd1031d1d71 100644 (file)
--- a/k_quants.c
+++ b/k_quants.c
@@ -2694,13 +2694,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
              const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
              __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
              p16l = _mm256_madd_epi16(scale_l, p16l);
-            sumi = _mm256_add_epi32(sumi, p16l);
  
              const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
              __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
              p16h = _mm256_madd_epi16(scale_h, p16h);
-            sumi = _mm256_add_epi32(sumi, p16h);
+            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
  
+            sumi = _mm256_add_epi32(sumi, sumj);
          }
  
          __m256 vd = _mm256_set1_ps(d);
author	Ronny Brendel <redacted>
	Mon, 28 Aug 2023 12:51:08 +0000 (14:51 +0200)
committer	GitHub <redacted>
	Mon, 28 Aug 2023 12:51:08 +0000 (15:51 +0300)