batched-bench : add `--output-format jsonl` option (#9293)

author Aarni Koskela <redacted>

Fri, 6 Sep 2024 15:59:58 +0000 (18:59 +0300)

committer GitHub <redacted>

Fri, 6 Sep 2024 15:59:58 +0000 (17:59 +0200)
author Aarni Koskela <redacted>
Fri, 6 Sep 2024 15:59:58 +0000 (18:59 +0300)
committer GitHub <redacted>
Fri, 6 Sep 2024 15:59:58 +0000 (17:59 +0200)
diff --git a/common/common.cpp b/common/common.cpp

index d22b049681c9f7c22efb88593815ce8bc4119f17..de2a177c165b4e73bcb68475e0c08097af07cb69 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1678,6 +1678,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          else { invalid_param = true; }
          return true;
      }
+    if (arg == "--output-format") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
+        else if (value == "md") { params.batched_bench_output_jsonl = false; }
+        else { invalid_param = true; }
+        return true;
+    }
      if (arg == "--no-warmup") {
          params.warmup = false;
          return true;
@@ -2068,6 +2076,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
      options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
      options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
  
+    options.push_back({ "batched-bench" });
+    options.push_back({ "batched-bench", "       --output-format {md,jsonl}", "output format for batched-bench results (default: md)" });
+
      printf("usage: %s [options]\n", argv[0]);
  
      for (const auto & o : options) {
diff --git a/common/common.h b/common/common.h

index cb5e7f6df10c530524f779644e670ddb6d4ea9a5..795ff44054d403435606a6455863c44da263fd95 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -275,6 +275,9 @@ struct gpt_params {
      bool spm_infill = false; // suffix/prefix/middle pattern for infill
  
      std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
  };
  
  void gpt_params_parse_from_env(gpt_params & params);
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md

index 4a07fe6bbf268c872ddd66f2983dafd88529fa42..df67c47e378cfff595cf182a62a4004cd4bd64f8 100644 (file)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -49,3 +49,12 @@ There are 2 modes of operation:
  |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
  |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
  |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
+
+```json lines
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
+```
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp

index 25e7c775a0095d9b0cd6100e407c692bb1cad178..25a950ea59a8ce6e5f2fcf98ceaf1f5aad833912 100644 (file)
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -122,12 +122,13 @@ int main(int argc, char ** argv) {
          }
      }
  
-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    if (!params.batched_bench_output_jsonl) {
+        LOG_TEE("\n");
+        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG_TEE("\n");
+        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }
  
      for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
          for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -195,7 +196,16 @@ int main(int argc, char ** argv) {
                  const float speed_tg = pl*tg / t_tg;
                  const float speed    = n_kv / t;
  
-                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                if(params.batched_bench_output_jsonl) {
+                    LOG_TEE(
+                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                    );
+                } else {
+                    LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                }
              }
          }
      }
author	Aarni Koskela <redacted>
	Fri, 6 Sep 2024 15:59:58 +0000 (18:59 +0300)
committer	GitHub <redacted>
	Fri, 6 Sep 2024 15:59:58 +0000 (17:59 +0200)
common/common.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
examples/batched-bench/README.md		patch \| blob \| history
examples/batched-bench/batched-bench.cpp		patch \| blob \| history