* fix indents and commands for Haiku, and add OpenBLAS detection in src/CMakeLists.txt
* add system detection and add OpenBLAS detection
* allow overriding the loop count via the GGML_NLOOP environment variable or a command line option
* replace the fmadd code with separate multiply and add on systems without FMA support
* allow overriding n_threads via the GGML_NTHREADS environment variable or a command line option
---------
Co-authored-by: Georgi Gerganov <redacted>
if (NOT UNAME_M)
execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
endif()
-message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
+#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-elseif (UNAME_S MATCHES "Linux")
+ elseif (UNAME_S MATCHES "Linux")
message(STATUS "Linux detected")
execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
if (SSE3_M MATCHES "sse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
endif()
- message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
-elseif (UNAME_S MATCHES "Haiku")
+ elseif (UNAME_S MATCHES "Haiku")
message(STATUS "Haiku detected")
+ # Each probe pipes `sysinfo -cpu` into a case-sensitive grep for an upper-case
+ # feature name, so the captured output has to be tested against the same
+ # upper-case spelling (MATCHES is case-sensitive too).
- execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX" OUTPUT_VARIABLE AVX1_M)
- if (AVX1_M MATCHES "AVX")
+ execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
+ if (AVX1_M MATCHES "AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
- execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX2" OUTPUT_VARIABLE AVX2_M)
- if (AVX2_M MATCHES "AVX2")
+ execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
+ if (AVX2_M MATCHES "AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
- execute_process(COMMAND bash -c "sysinfo -cpu | grep -w FMA" OUTPUT_VARIABLE FMA_M)
- if (FMA_M MATCHES "FMA")
+ execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
+ if (FMA_M MATCHES "FMA")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
- execute_process(COMMAND bash -c "sysinfo -cpu | grep -w F16C" OUTPUT_VARIABLE F16C_M)
- if (F16C_M MATCHES "F16C")
+ execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
+ if (F16C_M MATCHES "F16C")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
- message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
endif()
endif()
endif()
+# Optional OpenBLAS support: look for cblas.h and the OpenBLAS library and,
+# when both are available, add them to the GGML_EXTRA_* variables.
+if (GGML_OPENBLAS)
+    # Extra directories to search for cblas.h.
+    set(OPENBLAS_INCLUDE_SEARCH_PATHS
+        /usr/include
+        /usr/include/openblas
+        /usr/include/openblas-base
+        /usr/local/include
+        /usr/local/include/openblas
+        /usr/local/include/openblas-base
+        /opt/OpenBLAS/include
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/include
+        )
+    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
+    # Require the header as well as the library: a failed find_path() leaves
+    # OPENBLAS_INC set to OPENBLAS_INC-NOTFOUND, which must not be appended to
+    # GGML_EXTRA_INCS.
+    if (OPENBLAS_LIB AND OPENBLAS_INC)
+        message(STATUS "OpenBLAS found")
+
+        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${OPENBLAS_LIB})
+        set(GGML_EXTRA_INCS  ${GGML_EXTRA_INCS}  ${OPENBLAS_INC})
+        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
+    else()
+        message(WARNING "OpenBLAS not found")
+    endif()
+endif()
+
if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
endif()
.
../include
../include/ggml
+ ${GGML_EXTRA_INCS}
)
if (MSVC)
+# check systems
+# Each uname value is queried only when the variable has not been set already.
+if (NOT UNAME_S)
+    execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S)
+endif()
+if (NOT UNAME_P)
+    execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P)
+endif()
+if (NOT UNAME_M)
+    execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
+endif()
+#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+if (UNAME_S MATCHES "Darwin")
+    if (NOT UNAME_P MATCHES "arm")
+        # Ask sysctl whether the hardware is actually arm64 even though uname
+        # did not report "arm"; only a warning is emitted, nothing is overridden.
+        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
+        if (SYSCTL_M MATCHES "1")
+            #set(UNAME_P "arm")
+            #set(UNAME_M "arm64")
+            message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789")
+        endif()
+    endif()
+endif()
+
+# Build GGML_C_FLAGS, the SIMD compile flags for the tests.
+# ARM targets get no extra flags. On other processors the host CPU is probed
+# (sysctl on Darwin, /proc/cpuinfo on Linux, sysinfo on Haiku); any other
+# system gets the full AVX/AVX2/FMA/F16C set.
+# The processor name is quoted so that an empty CMAKE_SYSTEM_PROCESSOR still
+# yields a well-formed condition.
+if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm" OR "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64")
+    message(STATUS "ARM detected")
+    #set(GGML_C_FLAGS "${GGML_C_FLAGS} -mcpu=apple-m1")
+else()
+    message(STATUS "x86 detected")
+    #set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
+    if (UNAME_S MATCHES "Darwin")
+        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
+        if (AVX1_M MATCHES "AVX1.0")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+        endif()
+        execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
+        if (AVX2_M MATCHES "AVX2")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+        endif()
+        # FMA is looked up in the machdep.cpu.features output captured above.
+        if (AVX1_M MATCHES "FMA")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+        endif()
+        # F16C is enabled without a probe on Darwin.
+        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+    elseif (UNAME_S MATCHES "Linux")
+        message(STATUS "Linux detected")
+        execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
+        if (AVX1_M MATCHES "avx")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+        endif()
+        execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
+        if (AVX2_M MATCHES "avx2")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+        endif()
+        execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
+        if (FMA_M MATCHES "fma")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+        endif()
+        execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
+        if (F16C_M MATCHES "f16c")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+        endif()
+        execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
+        if (SSE3_M MATCHES "sse3")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -msse3")
+        endif()
+    elseif (UNAME_S MATCHES "Haiku")
+        message(STATUS "Haiku detected")
+        # grep is case-sensitive and is given upper-case feature names here, so
+        # the captured output must be tested with the same upper-case spelling.
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
+        if (AVX1_M MATCHES "AVX")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+        endif()
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
+        if (AVX2_M MATCHES "AVX2")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+        endif()
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
+        if (FMA_M MATCHES "FMA")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+        endif()
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
+        if (F16C_M MATCHES "F16C")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+        endif()
+    else()
+        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma -mf16c -mavx -mavx2")
+    endif()
+endif()
+
# on APPLE - include Accelerate framework
if (APPLE AND NOT GGML_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
endif()
endif()
+# Optional OpenBLAS support for the tests: look for cblas.h and the OpenBLAS
+# library and, when both are available, add them to the GGML_EXTRA_* variables.
+if (GGML_OPENBLAS)
+    # Extra directories to search for cblas.h.
+    set(OPENBLAS_INCLUDE_SEARCH_PATHS
+        /usr/include
+        /usr/include/openblas
+        /usr/include/openblas-base
+        /usr/local/include
+        /usr/local/include/openblas
+        /usr/local/include/openblas-base
+        /opt/OpenBLAS/include
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/include
+        )
+    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
+    # Require the header as well as the library: a failed find_path() leaves
+    # OPENBLAS_INC set to OPENBLAS_INC-NOTFOUND, which must not be appended to
+    # GGML_EXTRA_INCS.
+    if (OPENBLAS_LIB AND OPENBLAS_INC)
+        message(STATUS "OpenBLAS found")
+
+        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${OPENBLAS_LIB})
+        set(GGML_EXTRA_INCS  ${GGML_EXTRA_INCS}  ${OPENBLAS_INC})
+        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
+    else()
+        message(WARNING "OpenBLAS not found")
+    endif()
+endif()
+
#
# test-vec0
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
- set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
+ #set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
+ # Quote the value: when no SIMD flag was detected GGML_C_FLAGS is empty, and an
+ # unquoted expansion would leave COMPILE_FLAGS without a value and make
+ # set_target_properties() reject the call.
+ set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "${GGML_C_FLAGS}")
endif()
#
set(TEST_TARGET test-mul-mat0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
+target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
#
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
#
-# test-svd0 (arm)
+# test-svd0 (arm/x86)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
set(TEST_TARGET test-svd0)
target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS)
+ set(TEST_TARGET test-svd0)
+ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
+ target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
+ target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
+ add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
endif()
int ne[4];
- for (int iter = 0; iter < 1000; ++iter) {
+ // original loop: 1000
+ int niter = 1000;
+ const char *env = getenv("GGML_NLOOP");
+ if (env != NULL) {
+ niter = atoi(env);
+ }
+ if (argc > 1) {
+ niter = atoi(argv[1]);
+ }
+ for (int iter = 0; iter < niter; ++iter) {
+ printf("test-grad0: iter:%d/%d\n", iter, niter);
struct ggml_context * ctx0 = ggml_init(params);
get_random_dims(ne, 4);
int ne[4];
- for (int iter = 0; iter < 500; ++iter) {
+ // original loop: 500
+ int niter = 500;
+ const char *env = getenv("GGML_NLOOP");
+ if (env != NULL) {
+ niter = atoi(env);
+ }
+ if (argc > 1) {
+ niter = atoi(argv[1]);
+ }
+ for (int iter = 0; iter < niter; ++iter) {
+ printf("test-mul-mat0: iter:%d/%d\n", iter, niter);
struct ggml_context * ctx0 = ggml_init(params);
get_random_dims(ne, 4);
__m256 b1 = _mm256_loadu_ps(src1 + j + 8);
__m256 b2 = _mm256_loadu_ps(src1 + j + 16);
__m256 b3 = _mm256_loadu_ps(src1 + j + 24);
+#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+ sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+ sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+ sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+ sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
}
dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3)));
for (int j = 0; j < ncols8; j += 8) {
__m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j)));
__m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
+#if defined(__FMA__)
sum = _mm256_fmadd_ps(a, b, sum);
+#else
+ sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum);
+#endif
}
dst[i] = reduce_vector8_0(sum);
__m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
__m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
__m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
+#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
+#else
+ sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+ sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1);
__m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
__m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16)));
__m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24)));
+#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+ sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+ sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+ sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+ sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
__m256 b1 = _mm256_loadu_ps(src1 + j + 8);
__m256 b2 = _mm256_loadu_ps(src1 + j + 16);
__m256 b3 = _mm256_loadu_ps(src1 + j + 24);
+#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+ sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+ sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+ sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+ sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
opt_params.adam.alpha = 0.01f;
- opt_params.n_threads = (argc > 1) ? atoi(argv[1]) : 8;
+ // original threads: 8
+ int nthreads = 8;
+ const char *env = getenv("GGML_NTHREADS");
+ if (env != NULL) {
+ nthreads = atoi(env);
+ }
+ if (argc > 1) {
+ nthreads = atoi(argv[1]);
+ }
+ opt_params.n_threads = nthreads;
+ printf("test2: n_threads:%d\n", opt_params.n_threads);
const float xi[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f , 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, };
float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, };