bench : add memcpy and ggml_mul_mat benchmarks

author Georgi Gerganov <redacted>

Wed, 18 Jan 2023 18:31:46 +0000 (20:31 +0200)

committer Georgi Gerganov <redacted>

Wed, 18 Jan 2023 18:31:46 +0000 (20:31 +0200)
author Georgi Gerganov <redacted>
Wed, 18 Jan 2023 18:31:46 +0000 (20:31 +0200)
committer Georgi Gerganov <redacted>
Wed, 18 Jan 2023 18:31:46 +0000 (20:31 +0200)
diff --git a/Makefile b/Makefile

index b7edea8f5c2700a18f12ce4ca51aedfcc52762a3..56c37935b5c328ae43f66cade38224c310eef8aa 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -133,8 +133,8 @@ ifdef WHISPER_OPENBLAS
         LDFLAGS += -lopenblas
  endif
  ifdef WHISPER_GPROF
-       CFLAGS  += -pg
-       CXXFLAGS  += -pg
+       CFLAGS   += -pg
+       CXXFLAGS += -pg
  endif
  ifneq ($(filter aarch64%,$(UNAME_M)),)
  endif
diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp

index 2fd2423f5fa1517916ef32866c9c2ea761ba871a..5f99774045803722e7314a7658e72df958165441 100644 (file)
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@@ -1,12 +1,16 @@
+#include "ggml.h"
  #include "whisper.h"
  
  #include <cstdio>
+#include <cstring>
  #include <string>
  #include <thread>
+#include <vector>
  
  // command-line parameters
  struct whisper_params {
      int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
  
      std::string model = "models/ggml-base.en.bin";
  };
@@ -23,6 +27,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          }
          else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
          else if (arg == "-m" || arg == "--model")   { params.model     = argv[++i]; }
+        else if (arg == "-w" || arg == "--what")    { params.what     = atoi(argv[++i]); }
          else {
              fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
              whisper_print_usage(argc, argv, params);
@@ -41,16 +46,14 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
      fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
      fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n",                                  params.model.c_str());
+    fprintf(stderr, "  -w N,     --what N      [%-7d] what to benchmark:\n",                          params.what);
+    fprintf(stderr, "                           %-7s  0 - whisper encoder\n",                         "");
+    fprintf(stderr, "                           %-7s  1 - memcpy\n",                                  "");
+    fprintf(stderr, "                           %-7s  2 - ggml_mul_mat\n",                            "");
      fprintf(stderr, "\n");
  }
  
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
+int bench_whisper_encoder(const whisper_params & params) {
      // whisper init
  
      struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -92,3 +95,150 @@ int main(int argc, char ** argv) {
  
      return 0;
  }
+
+int bench_memcpy(const whisper_params & params) {
+    size_t n    = 50;
+    size_t arr  = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
+
+    // 1 GB array
+    const size_t size = arr*1024llu*1024llu;
+
+    char * src = (char *) malloc(size);
+    char * dst = (char *) malloc(size);
+
+    for (size_t i = 0; i < size; i++) src[i] = i;
+
+    memcpy(dst, src, size); // heat-up
+
+    double tsum = 0.0;
+
+    for (size_t i = 0; i < n; i++) {
+        const int64_t t0 = ggml_time_us();
+
+        memcpy(dst, src, size);
+
+        const int64_t t1 = ggml_time_us();
+
+        tsum += (t1 - t0)*1e-6;
+
+        src[0] = rand();
+    }
+
+    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+
+    // needed to prevent the compile from optimizing the memcpy away
+    {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < size; i++) sum += dst[i];
+
+        fprintf(stderr, "sum:    %s\n", sum == -536870910.00 ? "ok" : "error");
+    }
+
+    free(src);
+    free(dst);
+
+    return 0;
+}
+
+int bench_ggml_mul_mat(const whisper_params & params) {
+    const int n_max = 128;
+
+    const std::vector<size_t> sizes = {
+        64, 128, 256, 512, 1024, 2048, 4096,
+    };
+
+    const size_t N_max = sizes.back();
+
+    // a: N*N*sizeof(float)
+    // b: N*N*sizeof(float)
+    // c: N*N*sizeof(float)
+    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+
+    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_fp16 = 0;
+        int n_fp32 = 0;
+
+        // GFLOPS/s
+        double s_fp16 = 0.0;
+        double s_fp32 = 0.0;
+
+        const size_t N = sizes[j];
+
+        for (int k = 0; k < 2; ++k) {
+            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+            double & s = k == 0 ? s_fp16 : s_fp32;
+            int    & n = k == 0 ? n_fp16   : n_fp32;
+
+            struct ggml_init_params gparams = {
+                /*.mem_size   =*/ buf.size(),
+                /*.mem_buffer =*/ buf.data(),
+            };
+
+            struct ggml_context * ctx0 = ggml_init(gparams);
+
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
+            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+
+            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+
+            struct ggml_cgraph gf = ggml_build_forward(c);
+
+            gf.n_threads = params.n_threads;
+
+            double tsum = 0.0;
+
+            // heat-up
+            ggml_graph_compute(ctx0, &gf);
+
+            for (int i = 0; i < n_max; ++i) {
+                const int64_t t0 = ggml_time_us();
+
+                ggml_graph_compute(ctx0, &gf);
+
+                const int64_t t1 = ggml_time_us();
+
+                tsum += (t1 - t0)*1e-6;
+                n++;
+
+                if (tsum > 1.0 && n >= 3) {
+                    break;
+                }
+            }
+
+            ggml_free(ctx0);
+
+            s = ((2.0*N*N*N*n)/tsum)*1e-9;
+        }
+
+        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+            N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+    }
+
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    ggml_time_init();
+
+    int ret = -1;
+
+    switch (params.what) {
+        case 0: ret = bench_whisper_encoder(params); break;
+        case 1: ret = bench_memcpy(params);          break;
+        case 2: ret = bench_ggml_mul_mat(params);    break;
+        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
+    }
+
+    return ret;
+}
diff --git a/extra/bench-all.sh b/extra/bench-all.sh

index fbdc4c218ffc2efe5542ab7e0c84408e5e1d6a33..bfb37641632c0b6bb5520b60b394d5cd8d27e6fd 100755 (executable)
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@@ -12,6 +12,18 @@ fi
  
  models=( "tiny" "base" "small" "medium" "large" )
  
+printf "\n"
+printf "Running memcpy benchmark with 1 thread\n"
+printf "\n"
+
+./bench -w 1 -t 1 2>&1
+
+printf "\n"
+printf "Running ggml_mul_mat benchmark with " $n_threads " threads\n"
+printf "\n"
+
+./bench -w 2 -t $n_threads 2>&1
+
  printf "\n"
  printf "Running benchmark for all models\n"
  printf "This can take a while!\n"
@@ -56,4 +68,3 @@ for model in "${models[@]}"; do
  
      printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
  done
-
diff --git a/ggml.c b/ggml.c

index c59ee64af00c48455e49510ca1760d17e2b9169d..16f0f85997bceaac060d246c7b45b9f26e4fe4eb 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -4373,7 +4373,9 @@ static void ggml_compute_forward_mul_mat_f32(
      if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
          GGML_ASSERT(nb10 == sizeof(float));
  
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
+            return;
+        }
  
          if (params->type == GGML_TASK_INIT) {
              return;
@@ -4616,7 +4618,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
      if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
          GGML_ASSERT(nb10 == sizeof(float));
  
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
+            return;
+        }
  
          if (params->type == GGML_TASK_INIT) {
              return;
@@ -7054,7 +7058,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
  #ifdef __APPLE__
  
  //#include <os/lock.h>
-
+//
  //typedef os_unfair_lock ggml_lock_t;
  //
  //#define ggml_lock_init(x)    UNUSED(x)
@@ -7161,6 +7165,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
              if (state->params.ith < state->params.nth) {
                  ggml_compute_forward(&state->params, state->node);
              }
+
              state->node = NULL;
          } else {
              break;
@@ -7205,6 +7210,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                  .node   = NULL,
                  .shared = &state_shared,
              };
+
              int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
              assert(rc == 0);
              UNUSED(rc);
@@ -7273,7 +7279,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                                  node->src1->type == GGML_TYPE_F32) {
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                                  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                                    node->n_tasks = 1;
+                                    node->n_tasks = 1; // TODO: this actually is doing nothing
+                                                       //       the threads are still spinning
                                      cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
                                  } else {
                                      cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
author	Georgi Gerganov <redacted>
	Wed, 18 Jan 2023 18:31:46 +0000 (20:31 +0200)
committer	Georgi Gerganov <redacted>
	Wed, 18 Jan 2023 18:31:46 +0000 (20:31 +0200)
Makefile		patch \| blob \| history
examples/bench/bench.cpp		patch \| blob \| history
extra/bench-all.sh		patch \| blob \| history
ggml.c		patch \| blob \| history