ggml : fix AVX build + update to new Q8_0 format

author Georgi Gerganov <redacted>

Sat, 22 Apr 2023 08:08:12 +0000 (11:08 +0300)

committer Georgi Gerganov <redacted>

Sat, 22 Apr 2023 08:08:12 +0000 (11:08 +0300)
author Georgi Gerganov <redacted>
Sat, 22 Apr 2023 08:08:12 +0000 (11:08 +0300)
committer Georgi Gerganov <redacted>
Sat, 22 Apr 2023 08:08:12 +0000 (11:08 +0300)
diff --git a/Makefile b/Makefile

index 3b48eec9906e4a5ddc9bd37c81d788ad87812b28..b297959c937da30627a994b2bdbc533d968714d7 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -74,13 +74,17 @@ endif
  #       feel free to update the Makefile for your architecture and send a pull request or issue
  ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
         # Use all CPU extensions that are available:
-       CFLAGS += -march=native -mtune=native
+       CFLAGS   += -march=native -mtune=native
         CXXFLAGS += -march=native -mtune=native
+
+       # Usage AVX-only
+       #CFLAGS   += -mfma -mf16c -mavx
+       #CXXFLAGS += -mfma -mf16c -mavx
  endif
  ifneq ($(filter ppc64%,$(UNAME_M)),)
         POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
         ifneq (,$(findstring POWER9,$(POWER9_M)))
-               CFLAGS += -mcpu=power9
+               CFLAGS   += -mcpu=power9
                 CXXFLAGS += -mcpu=power9
         endif
         # Require c++23's std::byteswap for big-endian support.
@@ -114,7 +118,7 @@ ifdef LLAMA_GPROF
         CXXFLAGS += -pg
  endif
  ifneq ($(filter aarch64%,$(UNAME_M)),)
-       CFLAGS += -mcpu=native
+       CFLAGS   += -mcpu=native
         CXXFLAGS += -mcpu=native
  endif
  ifneq ($(filter armv6%,$(UNAME_M)),)
diff --git a/ggml.c b/ggml.c

index 72b392fdb87e1a7c9b07856420842fdfb1d79532..46c0292fe1ce397ae7f695af85331e4795d27612 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -468,6 +468,14 @@ static inline int hsum_i32_8(const __m256i a) {
      return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
  }
  
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
  #if __AVX2__ || __AVX512F__
  // Unpack 32 4-bit fields into 32 bytes
  // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
@@ -1381,7 +1389,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
          y[i].s1 = d * sum1;
      }
  #elif defined(__AVX2__) || defined(__AVX__)
-    // TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
      for (int i = 0; i < nb; i++) {
          // Load elements into 4 AVX vectors
          __m256 v0 = _mm256_loadu_ps( x );
@@ -1460,7 +1467,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
          // Compute the sum of the quants and set y[i].s
          const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
          const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = d * hsum_i32_8(_mm256_set_m128i(s1, s0));
+        y[i].s0 = d * hsum_i32_4(s0);
+        y[i].s1 = d * hsum_i32_4(s1);
  
          // Convert int32 to int16
          ni0 = _mm_packs_epi32( ni0, ni1 );
diff --git a/llama.cpp b/llama.cpp

index 00cce6e2add689076f325de24ad3ad165d01c1d0..4e92f551585a6116dd45f1ebd3f1ab00134bbc7c 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -68,7 +68,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
          { MODEL_65B,   512ull * MB },
      };
      return _MEM_REQ_SCRATCH1;
-};
+}
  
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
          { MODEL_65B,  5120ull * MB },
      };
      return _MEM_REQ_KV_SELF;
-};
+}
  
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -93,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
          { MODEL_65B, 1536ull * MB },
      };
      return _MEM_REQ_EVAL;
-};
+}
  
  // default hparams (LLaMA 7B)
  struct llama_hparams {
author	Georgi Gerganov <redacted>
	Sat, 22 Apr 2023 08:08:12 +0000 (11:08 +0300)
committer	Georgi Gerganov <redacted>
	Sat, 22 Apr 2023 08:08:12 +0000 (11:08 +0300)
Makefile		patch \| blob \| history
ggml.c		patch \| blob \| history
llama.cpp		patch \| blob \| history