return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}
-// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
-static int sve_lane_count(void) {
-#if defined(__ARM_FEATURE_SVE)
- return ggml_sve_cnt_b;
-#else
- return 0;
-#endif
-}
-
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
#if defined(__ARM_FEATURE_SVE)
- if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
+ if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
+ if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
#include <unistd.h>
#endif
-#if defined(__ARM_FEATURE_SVE)
-int ggml_sve_cnt_b = 0;
-#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE
#endif
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type { // ARM CPU feature flags, filled in by ggml_init_arm_arch_features()
+ int has_neon; // Advanced SIMD flag; starts at -1 until detection has run
+ int has_i8mm; // int8 matrix-multiply flag; starts at -1 until detection has run
+ int has_sve; // SVE flag; starts at -1 until detection has run
+ int sve_cnt; // value returned by ggml_cpu_get_sve_cnt(); starts at 0 (presumably the SVE vector length in bytes - confirm)
+} ggml_arm_arch_features = {-1, -1, -1, 0};
+#endif
+
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
switch (status) {
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
////////////////////////////////////////////////////////////////////////////////
+#if defined(__ARM_ARCH)
+
+#if defined(__linux__) && defined(__aarch64__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+static void ggml_init_arm_arch_features(void) { // fill ggml_arm_arch_features using the detection path available on this platform
+#if defined(__linux__) && defined(__aarch64__) // Linux/AArch64: read capability bits from the auxiliary vector
+ uint32_t hwcap = getauxval(AT_HWCAP); // NOTE(review): getauxval() returns unsigned long; the value is narrowed to 32 bits here
+ uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+ ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); // !! collapses the masked bit to 0 or 1
+ ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+ ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
+
+#if defined(__ARM_FEATURE_SVE) // vector length is only queried when the build itself targets SVE; otherwise sve_cnt is left as is
+ ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); // keep only the length bits; NOTE(review): a failing prctl() (-1) is not checked
+#endif
+#elif defined(__APPLE__) // Apple: query feature flags through sysctlbyname()
+ int oldp = 0; // receives each sysctl value in turn
+ size_t size = sizeof(oldp);
+ if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+ oldp = 0; // a failed lookup is treated as "feature absent"
+ }
+ ggml_arm_arch_features.has_neon = oldp;
+
+ if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+ oldp = 0; // a failed lookup is treated as "feature absent"
+ }
+ ggml_arm_arch_features.has_i8mm = oldp;
+
+ ggml_arm_arch_features.has_sve = 0; // SVE is unconditionally reported as absent on this path
+ ggml_arm_arch_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection is not implemented for this platform; fall back to compile-time feature macros
+#if defined(__ARM_NEON)
+ ggml_arm_arch_features.has_neon = 1;
+#else
+ ggml_arm_arch_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+ ggml_arm_arch_features.has_i8mm = 1;
+#else
+ ggml_arm_arch_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+ ggml_arm_arch_features.has_sve = 1;
+ ggml_arm_arch_features.sve_cnt = 16; // NOTE(review): hard-coded value; this path performs no run-time vector-length query
+#else
+ ggml_arm_arch_features.has_sve = 0;
+ ggml_arm_arch_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
+#if defined(__ARM_ARCH)
+ ggml_init_arm_arch_features();
+#endif
+
is_first_call = false;
}
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
-#if defined(__ARM_FEATURE_SVE)
- if (!ggml_sve_cnt_b) {
- ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
- }
-#endif
-
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
ggml_critical_section_end();
}
int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
- return 1;
+#if defined(__ARM_ARCH) // on ARM builds the answer comes from the run-time feature table instead of a compile-time macro
+ return ggml_arm_arch_features.has_neon; // set by ggml_init_arm_arch_features(); NOTE(review): holds -1 (nonzero) until that has run
#else
return 0;
#endif
}
int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
- return 1;
+#if defined(__ARM_ARCH) // on ARM builds the answer comes from the run-time feature table instead of a compile-time macro
+ return ggml_arm_arch_features.has_sve; // set by ggml_init_arm_arch_features(); NOTE(review): holds -1 (nonzero) until that has run
#else
return 0;
#endif
}
int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
- return 1;
+#if defined(__ARM_ARCH) // on ARM builds the answer comes from the run-time feature table instead of a compile-time macro
+ return ggml_arm_arch_features.has_i8mm; // set by ggml_init_arm_arch_features(); NOTE(review): holds -1 (nonzero) until that has run
#else
return 0;
#endif
}
+int ggml_cpu_get_sve_cnt(void) { // accessor for the sve_cnt field of the ARM feature table; always 0 on non-ARM builds
+#if defined(__ARM_ARCH)
+ return ggml_arm_arch_features.sve_cnt; // 0 unless ggml_init_arm_arch_features() stored a different value
+#else
+ return 0;
+#endif
+}
////////////////////////////////////////////////////////////////////////////////