ggml : add llamafile sgemm (llama/6414)

author Justine Tunney <redacted>

Tue, 16 Apr 2024 18:55:30 +0000 (14:55 -0400)

committer Georgi Gerganov <redacted>

Sat, 11 May 2024 18:30:08 +0000 (21:30 +0300)
author Justine Tunney <redacted>
Tue, 16 Apr 2024 18:55:30 +0000 (14:55 -0400)
committer Georgi Gerganov <redacted>
Sat, 11 May 2024 18:30:08 +0000 (21:30 +0300)
diff --git a/src/ggml-impl.h b/src/ggml-impl.h

index e68b728775c414e4d2a24eaa81f2034c0141374f..0c997d3ed521f240b2d5d430db79f83ecb82a5c1 100644 (file)
--- a/src/ggml-impl.h
+++ b/src/ggml-impl.h
@@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
  #if !defined(__riscv)
  #include <immintrin.h>
  #endif
diff --git a/src/ggml-quants.c b/src/ggml-quants.c

index 029511a60748a8cbb88179cee9995e95b436a017..4be9575e0c101459686cf5517e7810375983b863 100644 (file)
--- a/src/ggml-quants.c
+++ b/src/ggml-quants.c
@@ -138,7 +138,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }
  
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
      const __m256i zero = _mm256_setzero_si256();
      const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
      return _mm256_cvtepi32_ps(summed_pairs);
diff --git a/src/ggml.c b/src/ggml.c

index ba06665a5364c433f571575a0da0f59371a4ccfd..c5280e718cf489fa9215211634c01ab31ea7c659 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -4,6 +4,7 @@
  #include "ggml-impl.h"
  #include "ggml-quants.h"
  #include "ggml.h"
+#include "sgemm.h"
  
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,14 @@
  #include <unistd.h>
  #endif
  
+#ifndef GGML_USE_LLAMAFILE
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#define GGML_USE_LLAMAFILE 0
+#else
+#define GGML_USE_LLAMAFILE 1
+#endif
+#endif
+
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
  // we should just be careful :)
@@ -10872,6 +10881,28 @@ static void ggml_compute_forward_mul_mat(
      }
  #endif
  
+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type)) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
      if (params->type == GGML_TASK_TYPE_INIT) {
          if (ith != 0) {
              return;
@@ -10903,6 +10934,29 @@ static void ggml_compute_forward_mul_mat(
      const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
      const size_t row_size = ggml_row_size(vec_dot_type, ne10);
  
+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (nb12/ggml_type_size(src1->type)*ggml_type_size(vec_dot_type)*i12 +
+                                                            nb13/ggml_type_size(src1->type)*ggml_type_size(vec_dot_type)*i13),
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
      const int64_t nr0 = ne01;          // src0 rows
      const int64_t nr1 = ne1*ne12*ne13; // src1 rows
author	Justine Tunney <redacted>
	Tue, 16 Apr 2024 18:55:30 +0000 (14:55 -0400)
committer	Georgi Gerganov <redacted>
	Sat, 11 May 2024 18:30:08 +0000 (21:30 +0300)
src/ggml-impl.h		patch \| blob \| history
src/ggml-quants.c		patch \| blob \| history
src/ggml.c		patch \| blob \| history