examples : fix benchmark-matmult (#1554)

author Georgi Gerganov <redacted>

Wed, 20 Sep 2023 07:02:39 +0000 (10:02 +0300)

committer GitHub <redacted>

Wed, 20 Sep 2023 07:02:39 +0000 (10:02 +0300)
author Georgi Gerganov <redacted>
Wed, 20 Sep 2023 07:02:39 +0000 (10:02 +0300)
committer GitHub <redacted>
Wed, 20 Sep 2023 07:02:39 +0000 (10:02 +0300)
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp

index b16ad24d3414ade5c444df23ee0e67cec34fa66d..c8f7d486976d7b67d8bd61743824655d65c83008 100644 (file)
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -33,11 +33,11 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
  }
  
  float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
          for (int j = 0; j < tensor->ne[1]; j++) {
              for (int k = 0; k < tensor->ne[0]; k++) {
-                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
              }
          }
      }
@@ -126,12 +126,15 @@ int main(int argc, char ** argv)  {
  
      //printf("Memsize required = %i\n", sizex*sizex);
  
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
      size_t ctx_size = 0;
      ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
      ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
      ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
      ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
      ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
      ctx_size += 1024*1024*16;
@@ -164,7 +167,7 @@ int main(int argc, char ** argv)  {
      struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
      ggml_set_f32(m2, 2.0f);
  
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
      // printf("Creating new tensor m11xm2\n");
      struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
  
@@ -182,17 +185,16 @@ int main(int argc, char ** argv)  {
  
      TENSOR_DUMP(gf.nodes[0]);
  
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
  
      int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
  
      std::vector<int64_t> hist_cur(1 << 4, 0);
  
      // Set up a the benchmark matrices
      // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
  
      // Set up a the compute graph
      // printf("Creating new tensor q31\n");
@@ -203,8 +205,8 @@ int main(int argc, char ** argv)  {
  
      // Set up a second graph computation to make sure we override the CPU cache lines
      // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
  
      // printf("Creating new tensor q32\n");
      struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -221,7 +223,7 @@ int main(int argc, char ** argv)  {
      printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
  
  
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
      float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
  
      printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
author	Georgi Gerganov <redacted>
	Wed, 20 Sep 2023 07:02:39 +0000 (10:02 +0300)
committer	GitHub <redacted>
	Wed, 20 Sep 2023 07:02:39 +0000 (10:02 +0300)