From: Georgi Gerganov <redacted>
Date: Sat, 25 Mar 2023 14:32:48 +0000 (+0200)
Subject: tests : add test-blas0
X-Git-Tag: upstream/0.0.1642~1571
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=24e23612d2eb307338af20ea0e6c2b611165c22b;p=pkg%2Fggml%2Fsources%2Fggml

tests : add test-blas0
---

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ac7039fe..69983a0f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -182,6 +182,17 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 endif()
 
+#
+# test-blas0 (arm)
+
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
+    set(TEST_TARGET test-blas0)
+    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
+    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
+    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+endif()
+
 #
 # test-mul-mat2
 
diff --git a/tests/test-blas0.c b/tests/test-blas0.c
new file mode 100644
index 00000000..22e23b23
--- /dev/null
+++ b/tests/test-blas0.c
@@ -0,0 +1,264 @@
+#include "ggml.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <sys/time.h>
+
+#include <arm_neon.h>
+
+#include <Accelerate/Accelerate.h>
+
+uint64_t get_time_us() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+//
+// naive implementation
+//
+
+void mul_mat_f32_0(
+    const float * restrict src0, // M x K
+    const float * restrict src1, // N x K (transposed)
+    float * dst,
+    int m, int n, int k) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            float sum = 0;
+            for (int l = 0; l < k; l++) {
+                sum += src0[i*k + l] * src1[j*k + l];
+            }
+            dst[j*m + i] = sum;
+        }
+    }
+}
+
+int main(int argc, const char ** argv) {
+    if (argc < 4) {
+        printf("Usage: %s M N K\n", argv[0]);
+        return 1;
+    }
+
+    int M = atoi(argv[1]);
+    int N = atoi(argv[2]);
+    int K = atoi(argv[3]);
+
+    srand(time(NULL));
+
+    if (M == 0) M = rand() % 1000 + 1;
+    if (N == 0) N = rand() % 1000 + 1;
+    if (K == 0) K = rand() % 1000 + 1;
+
+    printf("M = %d, N = %d, K = %d\n", M, N, K);
+
+    float * src0 = (float *)malloc(sizeof(float)*M*K);
+    float * src1 = (float *)malloc(sizeof(float)*N*K);
+    float * dst0 = (float *)malloc(sizeof(float)*M*N); // naive
+    float * dst1 = (float *)malloc(sizeof(float)*M*N); // blas
+
+    struct ggml_init_params params = {
+        .mem_size   = 2048ul*1024*1024,
+        .mem_buffer = NULL,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    struct ggml_tensor * s0_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, M);
+    struct ggml_tensor * s1_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, N);
+
+    struct ggml_tensor * s0_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, M);
+    struct ggml_tensor * s1_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, N);
+
+    for (int j = 0; j < M; j++) {
+        for (int i = 0; i < K; i++) {
+            //src0[j*K + i] = j;
+            src0[j*K + i] = 1e-3*(rand() % 1000);
+        }
+    }
+
+    for (int j = 0; j < N; j++) {
+        for (int i = 0; i < K; i++) {
+            //src1[j*K + i] = j + 1;
+            src1[j*K + i] = 1e-3*(rand() % 1000);
+        }
+    }
+
+    // copy src0 to s0_f32
+    {
+        float       * p_f32 = s0_f32->data;
+        ggml_fp16_t * p_f16 = s0_f16->data;
+        for (int i = 0; i < M; i++) {
+            for (int j = 0; j < K; j++) {
+                p_f32[i*K + j] = src0[i*K + j];
+                p_f16[i*K + j] = ggml_fp32_to_fp16(src0[i*K + j]);
+            }
+        }
+    }
+
+    // copy src1 to s1_f32
+    {
+        float       * p_f32 = s1_f32->data;
+        ggml_fp16_t * p_f16 = s1_f16->data;
+        for (int i = 0; i < N; i++) {
+            for (int j = 0; j < K; j++) {
+                p_f32[i*K + j] = src1[i*K + j];
+                p_f16[i*K + j] = ggml_fp32_to_fp16(src1[i*K + j]);
+            }
+        }
+    }
+
+    const clock_t start = clock();
+    const uint64_t start_us = get_time_us();
+
+    double iM = 1.0/M;
+    mul_mat_f32_0(src0, src1, dst0, M, N, K);
+
+    // Use BLAS sgemm from Accelerate framework
+    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, N, M, K, 1.0f, src1, K, src0, K, 0.0f, dst1, M);
+
+    struct ggml_tensor * dst2 = NULL;
+    struct ggml_tensor * dst3 = NULL;
+
+    {
+        dst2 = ggml_mul_mat(ctx0, s0_f32, s1_f32);
+
+        struct ggml_cgraph gf = ggml_build_forward(dst2);
+        ggml_graph_compute(ctx0, &gf);
+    }
+
+    {
+        dst3 = ggml_mul_mat(ctx0, s0_f16, s1_f32);
+
+        struct ggml_cgraph gf = ggml_build_forward(dst3);
+        ggml_graph_compute(ctx0, &gf);
+    }
+
+    bool ok_blas = true;
+    bool ok_ggml_f32 = true;
+    bool ok_ggml_f16 = true;
+
+    // check BLAS
+    for (int i = 0; i < M*N; i++) {
+        if (fabs(dst0[i] - dst1[i])/fabs(dst0[i]) > 0.0001) {
+            printf("dst0[%d] = %f, dst1[%d] = %f\n", i, dst0[i], i, dst1[i]);
+            ok_blas = false;
+        }
+    }
+
+    // check ggml (f32)
+    {
+        float * p = dst2->data;
+        for (int i = 0; i < M*N; i++) {
+            if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.0001) {
+                printf("dst0[%d] = %f, dst2[%d] = %f\n", i, dst0[i], i, p[i]);
+                ok_ggml_f32 = false;
+            }
+        }
+    }
+
+    // check ggml (f16)
+    {
+        float * p = dst3->data;
+        for (int i = 0; i < M*N; i++) {
+            if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.01) {
+                printf("dst0[%d] = %f, dst3[%d] = %f\n", i, dst0[i], i, p[i]);
+                ok_ggml_f16 = false;
+            }
+        }
+    }
+
+    {
+        const clock_t end = clock();
+        const uint64_t end_us = get_time_us();
+        printf("%s: elapsed ticks: %ld\n",  __func__, end - start);
+    }
+
+#if 0
+    // print src0
+    printf("src0:\n");
+    for (int i = 0; i < M; i++) {
+        for (int j = 0; j < K; j++) {
+            printf("%4.1f ", src0[i*K+j]);
+        }
+        printf("\n");
+    }
+
+    // print src1
+    printf("src1:\n");
+    for (int i = 0; i < N; i++) {
+        for (int j = 0; j < K; j++) {
+            printf("%4.1f ", src1[i*K+j]);
+        }
+        printf("\n");
+    }
+
+    printf("\n");
+    printf("dst0 (naive):\n");
+    for (int j = 0; j < N; j++) {
+        for (int i = 0; i < M; i++) {
+            printf("%4.1f ", dst0[j*M+i]);
+        }
+        printf("\n");
+    }
+
+    printf("\n");
+    printf("dst1 (BLAS):\n");
+    for (int j = 0; j < N; j++) {
+        for (int i = 0; i < M; i++) {
+            printf("%4.1f ", dst1[j*M+i]);
+        }
+        printf("\n");
+    }
+
+    printf("\n");
+    printf("dst2 (ggml f32):\n");
+    for (int j = 0; j < N; j++) {
+        for (int i = 0; i < M; i++) {
+            printf("%4.1f ", ((float *)dst2->data)[j*M+i]);
+        }
+        printf("\n");
+    }
+
+    printf("\n");
+    printf("dst3 (ggml f16):\n");
+    for (int j = 0; j < N; j++) {
+        for (int i = 0; i < M; i++) {
+            printf("%4.1f ", ((float *)dst3->data)[j*M+i]);
+        }
+        printf("\n");
+    }
+
+    printf("\n");
+#endif
+
+    free(src0);
+    free(src1);
+    free(dst0);
+    free(dst1);
+
+    ggml_free(ctx0);
+
+    printf("ok_blas = %d\n", ok_blas);
+    if (!ok_blas) {
+        printf("ERROR: BLAS failed\n");
+    }
+
+    printf("ok_ggml_f32 = %d\n", ok_ggml_f32);
+    if (!ok_ggml_f32) {
+        printf("ERROR: ggml failed\n");
+    }
+
+    printf("ok_ggml_f16 = %d\n", ok_ggml_f16);
+    if (!ok_ggml_f16) {
+        printf("ERROR: ggml failed\n");
+    }
+
+    return (ok_blas && ok_ggml_f32 && ok_ggml_f16) ? 0 : 1;
+}