ggml : fix 32-bit ARM build + quantization

author Georgi Gerganov <redacted>

Tue, 2 May 2023 18:47:12 +0000 (21:47 +0300)

committer Georgi Gerganov <redacted>

Tue, 2 May 2023 18:52:26 +0000 (21:52 +0300)
author Georgi Gerganov <redacted>
Tue, 2 May 2023 18:47:12 +0000 (21:47 +0300)
committer Georgi Gerganov <redacted>
Tue, 2 May 2023 18:52:26 +0000 (21:52 +0300)
diff --git a/Makefile b/Makefile

index 44aec46dcd11266745cac7914e42076af9fff888..87ce8c415e88c23e8d5e69a01917a2ac6c7a7506 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -177,7 +177,7 @@ ifdef WHISPER_GPROF
  endif
  
  ifneq ($(filter aarch64%,$(UNAME_M)),)
-       CFLAGS += -mcpu=native
+       CFLAGS   += -mcpu=native
         CXXFLAGS += -mcpu=native
  endif
  
@@ -188,15 +188,18 @@ endif
  
  ifneq ($(filter armv7%,$(UNAME_M)),)
         # 32-bit ARM, for example on Armbian or possibly raspbian
-       CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+       #CFLAGS   += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
+       #CXXFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
  
-       # 64-bit ARM, use these (TODO: auto-detect 64-bit)
-       # CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+       # 64-bit ARM on 32-bit OS, use these (TODO: auto-detect 64-bit)
+       CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
+       CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
  endif
  
  ifneq ($(filter armv8%,$(UNAME_M)),)
         # Raspberry Pi 4
-       CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+       CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
+       CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
  endif
  
  #
diff --git a/ggml.c b/ggml.c

index 91b3053dd23fe8be057e1adedb0ce79487afb29f..addf0c308078c8536d6b961a1b31e49725089d32 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -671,35 +671,91 @@ float vmaxvq_f32(float32x4_t v) {
  }
  
  int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-    return vget_low_s8(vcombine_s8(a, b));
+    int8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
  }
  
  int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-    return vget_high_s8(vcombine_s8(a, b));
+    int8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
  }
  
  uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    return vget_low_u8(vcombine_u8(a, b));
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
  }
  
  uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    return vget_high_u8(vcombine_u8(a, b));
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
  }
  
  int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
-    return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
+    int8x16_t res;
+
+    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
+    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
+    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
  }
  
  int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
-    return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
+    int8x16_t res;
+
+    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
+    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
+    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
  }
  
  uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-    return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
+    uint8x16_t res;
+
+    res[0]  = a[0];  res[1]  = b[0];  res[2]  = a[1];  res[3]  = b[1];
+    res[4]  = a[2];  res[5]  = b[2];  res[6]  = a[3];  res[7]  = b[3];
+    res[8]  = a[4];  res[9]  = b[4];  res[10] = a[5];  res[11] = b[5];
+    res[12] = a[6];  res[13] = b[6];  res[14] = a[7];  res[15] = b[7];
+
+    return res;
  }
  
  uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-    return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
+    uint8x16_t res;
+
+    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
+    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
+    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
  }
  
  int32x4_t vcvtnq_s32_f32(float32x4_t v) {
author	Georgi Gerganov <redacted>
	Tue, 2 May 2023 18:47:12 +0000 (21:47 +0300)
committer	Georgi Gerganov <redacted>
	Tue, 2 May 2023 18:52:26 +0000 (21:52 +0300)