UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
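
The substitution in this hunk (and the next) is equality-preserving: svcntw() returns the number of 32-bit words per SVE vector, so svcntw() == 8 picks out 256-bit (32-byte) vectors, and QK8_0 is 32 in ggml. A minimal sanity check of that arithmetic, compilable on its own (the function name is ours, not from the patch):

    #include <arm_sve.h>
    #include <assert.h>

    #define QK8_0 32  // ggml's q8_0 block size in bytes

    // On SVE hardware, the old word-count test and the new byte-count test
    // select the same vector length: 8 words * 4 bytes = 32 bytes.
    static void check_sve_width_equivalence(void) {
        assert((svcntw() == 8) == (svcntb() == QK8_0));
    }
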
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
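
This excerpt never shows where ggml_sve_cnt_b is defined or filled in. A minimal sketch of one plausible setup, assuming the global simply caches the SVE vector length in bytes once, so the hot kernels compare a plain integer instead of issuing a vector-length query on every call (the init function is hypothetical):

    #include <arm_sve.h>

    int ggml_sve_cnt_b = 0;  // SVE vector length in bytes; 0 until initialized

    // Hypothetical one-time init hook: after this runs, every kernel can
    // test `ggml_sve_cnt_b == QK8_0` without touching an SVE intrinsic.
    static void ggml_init_sve_cnt_b(void) {
        if (ggml_sve_cnt_b == 0) {
            ggml_sve_cnt_b = (int) svcntb();
        }
    }
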
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
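        // ... (SVE kernel body elided in this excerpt) ...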
return;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (ggml_cpu_has_neon()) {
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+ GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
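
The `&& ! ((defined(_MSC_VER)) && ! defined(__clang__))` clause on the hunk above admits gcc, clang, and clang-cl but excludes plain MSVC. The truth table below follows directly from the macros; the rationale comment is our assumption (MSVC's ARM64 toolchain not building this SVE path), not something the patch states:

    // Guard behavior, assuming plain MSVC cannot compile these SVE kernels:
    //   gcc / clang : _MSC_VER undefined                  -> SVE path compiled
    //   clang-cl    : _MSC_VER and __clang__ both defined -> SVE path compiled
    //   MSVC        : _MSC_VER defined, __clang__ not     -> SVE path skipped
    #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
    // 256-bit SVE fast path goes here
    #endif
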
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
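
The GGML_ASSERT lines in these hunks all use the `!(cond) && "message"` idiom: the string literal is non-null and therefore always truthy, so it never changes the result, but it shows up in the failure diagnostic when cond holds and the assert fires. A standalone illustration with plain assert (the variable is ours):

    #include <assert.h>

    int main(void) {
        int wrong_format_for_this_cpu = 1;  // stand-in for the SVE-256 check
        // Fires, and the printed expression carries the advice string:
        assert(!wrong_format_for_this_cpu &&
               "use the Q4_0_8_8 quantization format for optimal performance");
        return 0;
    }
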
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
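        // ... (SVE kernel body elided in this excerpt) ...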
return;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (ggml_cpu_has_neon()) {
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+ GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
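
Read together, the assert messages across these kernels encode one repacking rule per CPU feature set. A hypothetical helper condensing that advice (the function is ours; ggml_cpu_has_sve, ggml_cpu_has_neon, and ggml_cpu_has_matmul_int8 are the feature probes already used above):

    // Which 4-bit block layout the asserts above steer each machine toward.
    static const char * preferred_q4_0_repack(void) {
        if (ggml_cpu_has_sve() && ggml_sve_cnt_b == QK8_0)     return "Q4_0_8_8";
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) return "Q4_0_4_8";
        if (ggml_cpu_has_neon())                               return "Q4_0_4_4";
        return "Q4_0";  // plain fallback
    }
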
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (ggml_sve_cnt_b == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
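
Unlike the earlier hunks, the old code here already compared svcntb() == QK8_0, so this change only swaps a per-call intrinsic for the cached global. The runtime gate itself matters because the two predicates hard-code a 16/16 byte-lane split that is only a clean halving on a 32-byte vector; our reading, in comment form:

    // With ggml_sve_cnt_b == QK8_0 (32-byte vectors):
    //   ptrueh = svptrue_pat_b8(SV_VL16)         -> byte lanes 0..15 active
    //   ptruel = svnot_b_z(svptrue_b8(), ptrueh) -> byte lanes 16..31 active
    // On a 512-bit machine, ptruel would cover lanes 16..63 (48 lanes, not 16),
    // breaking the halving, which is why the kernel is gated at runtime.
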
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (ggml_sve_cnt_b == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);
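
The q8_0 hunk ends mid-function. The two accumulators are kept separate so consecutive iterations can issue independently; after the loop, SVE code of this shape would typically fold and reduce them into the scalar sumf along these lines (a sketch, not part of this excerpt):

    // Combine both partial sums, then reduce across lanes to a scalar.
    sumf = svaddv_f32(svptrue_b32(),
                      svadd_f32_x(svptrue_b32(), sumv0, sumv1));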