llama-bench: add `-d` depth arg (#13096)

author Vishal Agarwal <redacted>

Mon, 28 Apr 2025 14:50:39 +0000 (20:20 +0530)

committer GitHub <redacted>

Mon, 28 Apr 2025 14:50:39 +0000 (16:50 +0200)
author Vishal Agarwal <redacted>
Mon, 28 Apr 2025 14:50:39 +0000 (20:20 +0530)
committer GitHub <redacted>
Mon, 28 Apr 2025 14:50:39 +0000 (16:50 +0200)
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md

index 6bbe4bb75fbf8f0c26f7a86464fe66f3aadadef6..1f5e2f66200a65a7431f3093864dd70a10d1c9e9 100644 (file)
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -28,6 +28,7 @@ options:
    -p, --n-prompt <n>                        (default: 512)
    -n, --n-gen <n>                           (default: 128)
    -pg <pp,tg>                               (default: )
+  -d, --n-depth <n>                         (default: 0)
    -b, --batch-size <n>                      (default: 2048)
    -ub, --ubatch-size <n>                    (default: 512)
    -ctk, --cache-type-k <t>                  (default: f16)
@@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
  
  Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
  
+Using the `-d <n>` option, each test can be run at a specified context depth: the KV cache is prefilled with `<n>` tokens before every repetition, and this prefill is not included in the measured time.
+
  For a description of the other options, see the [main example](../main/README.md).
  
  Note:
@@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
  | llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  35 | pp 512     |   2400.01 ± 7.72 |
  | llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  35 | tg 128     |    131.66 ± 0.49 |
  
+### Different prefilled context
+
+```
+$ ./llama-bench -d 0,512
+```
+
+| model                          |       size |     params | backend    | ngl |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |           pp512 |      7340.20 ± 23.45 |
+| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |           tg128 |        120.60 ± 0.59 |
+| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |    pp512 @ d512 |      6425.91 ± 18.88 |
+| qwen2 7B Q4_K - Medium         |   4.36 GiB |     7.62 B | CUDA       |  99 |    tg128 @ d512 |        116.71 ± 0.60 |
+
  ## Output formats
  
  By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -170,9 +186,9 @@ $ ./llama-bench -o csv
  ```
  
  ```csv
-build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
-"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
-"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
+"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
  ```
  
  ### JSON
@@ -184,64 +200,78 @@ $ ./llama-bench -o json
  ```json
  [
    {
-    "build_commit": "3469684",
-    "build_number": 1275,
-    "cuda": true,
-    "metal": false,
-    "gpu_blas": true,
-    "blas": true,
-    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
-    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
-    "model_filename": "models/7B/ggml-model-q4_0.gguf",
-    "model_type": "llama 7B mostly Q4_0",
-    "model_size": 3825065984,
-    "model_n_params": 6738415616,
-    "n_batch": 512,
-    "n_threads": 16,
-    "f16_kv": true,
+    "build_commit": "8cf427ff",
+    "build_number": 5163,
+    "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
+    "gpu_info": "NVIDIA GeForce RTX 4080",
+    "backends": "CUDA",
+    "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
+    "model_type": "qwen2 7B Q4_K - Medium",
+    "model_size": 4677120000,
+    "model_n_params": 7615616512,
+    "n_batch": 2048,
+    "n_ubatch": 512,
+    "n_threads": 8,
+    "cpu_mask": "0x0",
+    "cpu_strict": false,
+    "poll": 50,
+    "type_k": "f16",
+    "type_v": "f16",
      "n_gpu_layers": 99,
+    "split_mode": "layer",
      "main_gpu": 0,
-    "mul_mat_q": true,
+    "no_kv_offload": false,
+    "flash_attn": false,
      "tensor_split": "0.00",
+    "use_mmap": true,
+    "embeddings": false,
      "n_prompt": 512,
      "n_gen": 0,
-    "test_time": "2023-09-23T12:09:57Z",
-    "avg_ns": 212365953,
-    "stddev_ns": 985423,
-    "avg_ts": 2410.974041,
-    "stddev_ts": 11.163766,
-    "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
-    "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
+    "n_depth": 0,
+    "test_time": "2025-04-24T11:58:50Z",
+    "avg_ns": 72135640,
+    "stddev_ns": 1453752,
+    "avg_ts": 7100.002165,
+    "stddev_ts": 140.341520,
+    "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
+    "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
    },
    {
-    "build_commit": "3469684",
-    "build_number": 1275,
-    "cuda": true,
-    "metal": false,
-    "gpu_blas": true,
-    "blas": true,
-    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
-    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
-    "model_filename": "models/7B/ggml-model-q4_0.gguf",
-    "model_type": "llama 7B mostly Q4_0",
-    "model_size": 3825065984,
-    "model_n_params": 6738415616,
-    "n_batch": 512,
-    "n_threads": 16,
-    "f16_kv": true,
+    "build_commit": "8cf427ff",
+    "build_number": 5163,
+    "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
+    "gpu_info": "NVIDIA GeForce RTX 4080",
+    "backends": "CUDA",
+    "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
+    "model_type": "qwen2 7B Q4_K - Medium",
+    "model_size": 4677120000,
+    "model_n_params": 7615616512,
+    "n_batch": 2048,
+    "n_ubatch": 512,
+    "n_threads": 8,
+    "cpu_mask": "0x0",
+    "cpu_strict": false,
+    "poll": 50,
+    "type_k": "f16",
+    "type_v": "f16",
      "n_gpu_layers": 99,
+    "split_mode": "layer",
      "main_gpu": 0,
-    "mul_mat_q": true,
+    "no_kv_offload": false,
+    "flash_attn": false,
      "tensor_split": "0.00",
+    "use_mmap": true,
+    "embeddings": false,
      "n_prompt": 0,
      "n_gen": 128,
-    "test_time": "2023-09-23T12:09:59Z",
-    "avg_ns": 977425219,
-    "stddev_ns": 9268593,
-    "avg_ts": 130.965708,
-    "stddev_ts": 1.238924,
-    "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
-    "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
+    "n_depth": 0,
+    "test_time": "2025-04-24T11:58:51Z",
+    "avg_ns": 1076767880,
+    "stddev_ns": 9449585,
+    "avg_ts": 118.881588,
+    "stddev_ts": 1.041811,
+    "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
+    "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
    }
  ]
  ```
@@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
  ```
  
  ```json lines
-{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
-{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
+{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
+{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
  ```
  
  
@@ -271,25 +301,32 @@ $ ./llama-bench -o sql
  CREATE TABLE IF NOT EXISTS test (
    build_commit TEXT,
    build_number INTEGER,
-  cuda INTEGER,
-  metal INTEGER,
-  gpu_blas INTEGER,
-  blas INTEGER,
    cpu_info TEXT,
    gpu_info TEXT,
+  backends TEXT,
    model_filename TEXT,
    model_type TEXT,
    model_size INTEGER,
    model_n_params INTEGER,
    n_batch INTEGER,
+  n_ubatch INTEGER,
    n_threads INTEGER,
-  f16_kv INTEGER,
+  cpu_mask TEXT,
+  cpu_strict INTEGER,
+  poll INTEGER,
+  type_k TEXT,
+  type_v TEXT,
    n_gpu_layers INTEGER,
+  split_mode TEXT,
    main_gpu INTEGER,
-  mul_mat_q INTEGER,
+  no_kv_offload INTEGER,
+  flash_attn INTEGER,
    tensor_split TEXT,
+  use_mmap INTEGER,
+  embeddings INTEGER,
    n_prompt INTEGER,
    n_gen INTEGER,
+  n_depth INTEGER,
    test_time TEXT,
    avg_ns INTEGER,
    stddev_ns INTEGER,
@@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test (
    stddev_ts REAL
  );
  
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
+INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
  ```
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp

index 564a51bfd7b6cbe28950a029f9adbbcfb69c1eac..5a78216e44fa446e1d12bc320c7f0b318a6add6f 100644 (file)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -200,6 +200,7 @@ struct cmd_params {
      std::vector<int>                 n_prompt;
      std::vector<int>                 n_gen;
      std::vector<std::pair<int, int>> n_pg;
+    std::vector<int>                 n_depth;
      std::vector<int>                 n_batch;
      std::vector<int>                 n_ubatch;
      std::vector<ggml_type>           type_k;
@@ -233,6 +234,7 @@ static const cmd_params cmd_params_defaults = {
      /* n_prompt             */ { 512 },
      /* n_gen                */ { 128 },
      /* n_pg                 */ {},
+    /* n_depth              */ { 0 },
      /* n_batch              */ { 2048 },
      /* n_ubatch             */ { 512 },
      /* type_k               */ { GGML_TYPE_F16 },
@@ -272,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) {
      printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
      printf("  -pg <pp,tg>                               (default: %s)\n",
             join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf("  -d, --n-depth <n>                         (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
      printf("  -b, --batch-size <n>                      (default: %s)\n",
             join(cmd_params_defaults.n_batch, ",").c_str());
      printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
@@ -409,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                  break;
              }
              params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
+        } else if (arg == "-d" || arg == "--n-depth") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
          } else if (arg == "-b" || arg == "--batch-size") {
              if (++i >= argc) {
                  invalid_param = true;
@@ -739,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
      if (params.n_pg.empty()) {
          params.n_pg = cmd_params_defaults.n_pg;
      }
+    if (params.n_depth.empty()) {
+        params.n_depth = cmd_params_defaults.n_depth;
+    }
      if (params.n_batch.empty()) {
          params.n_batch = cmd_params_defaults.n_batch;
      }
@@ -801,6 +814,7 @@ struct cmd_params_instance {
      std::string        model;
      int                n_prompt;
      int                n_gen;
+    int                n_depth;
      int                n_batch;
      int                n_ubatch;
      ggml_type          type_k;
@@ -880,7 +894,7 @@ struct cmd_params_instance {
      llama_context_params to_llama_cparams() const {
          llama_context_params cparams = llama_context_default_params();
  
-        cparams.n_ctx       = n_prompt + n_gen;
+        cparams.n_ctx       = n_prompt + n_gen + n_depth;
          cparams.n_batch     = n_batch;
          cparams.n_ubatch    = n_ubatch;
          cparams.type_k      = type_k;
@@ -916,6 +930,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
      for (const auto & nt : params.n_threads)
      for (const auto & cm : params.cpu_mask)
      for (const auto & cs : params.cpu_strict)
+    for (const auto & nd : params.n_depth)
      for (const auto & pl : params.poll) {
          for (const auto & n_prompt : params.n_prompt) {
              if (n_prompt == 0) {
@@ -925,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                  /* .model        = */ m,
                  /* .n_prompt     = */ n_prompt,
                  /* .n_gen        = */ 0,
+                /* .n_depth      = */ nd,
                  /* .n_batch      = */ nb,
                  /* .n_ubatch     = */ nub,
                  /* .type_k       = */ tk,
@@ -955,6 +971,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                  /* .model        = */ m,
                  /* .n_prompt     = */ 0,
                  /* .n_gen        = */ n_gen,
+                /* .n_depth      = */ nd,
                  /* .n_batch      = */ nb,
                  /* .n_ubatch     = */ nub,
                  /* .type_k       = */ tk,
@@ -985,6 +1002,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                  /* .model        = */ m,
                  /* .n_prompt     = */ n_pg.first,
                  /* .n_gen        = */ n_pg.second,
+                /* .n_depth      = */ nd,
                  /* .n_batch      = */ nb,
                  /* .n_ubatch     = */ nub,
                  /* .type_k       = */ tk,
@@ -1040,6 +1058,7 @@ struct test {
      bool                     embeddings;
      int                      n_prompt;
      int                      n_gen;
+    int                      n_depth;
      std::string              test_time;
      std::vector<uint64_t>    samples_ns;
  
@@ -1072,6 +1091,7 @@ struct test {
          embeddings     = inst.embeddings;
          n_prompt       = inst.n_prompt;
          n_gen          = inst.n_gen;
+        n_depth        = inst.n_depth;
          // RFC 3339 date-time format
          time_t t       = time(NULL);
          std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
@@ -1113,9 +1133,11 @@ struct test {
              "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
              "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
              "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
+            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
+            "embeddings",   "n_prompt",     "n_gen",          "n_depth",    "test_time",    "avg_ns",
              "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "use_mmap",     "embeddings",   "n_prompt",       "n_gen",      "test_time",    "avg_ns",
-            "stddev_ns",    "avg_ts",       "stddev_ts",
+            "use_mmap",     "embeddings",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
+            "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
          };
          return fields;
      }
@@ -1125,8 +1147,8 @@ struct test {
      static field_type get_field_type(const std::string & field) {
          if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
              field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
-            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
-            field == "stddev_ns") {
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
+            field == "avg_ns" || field == "stddev_ns") {
              return INT;
          }
          if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1204,6 +1226,7 @@ struct test {
                                              std::to_string(embeddings),
                                              std::to_string(n_prompt),
                                              std::to_string(n_gen),
+                                            std::to_string(n_depth),
                                              test_time,
                                              std::to_string(avg_ns()),
                                              std::to_string(stdev_ns()),
@@ -1381,7 +1404,7 @@ struct markdown_printer : public printer {
              return 4;
          }
          if (field == "test") {
-            return 13;
+            return 15;
          }
  
          int width = std::max((int) field.length(), 10);
@@ -1531,6 +1554,10 @@ struct markdown_printer : public printer {
                  } else {
                      snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                  }
+                if (t.n_depth > 0) {
+                    int len = strlen(buf);
+                    snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
+                }
                  value = buf;
              } else if (field == "t/s") {
                  snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
@@ -1789,6 +1816,14 @@ int main(int argc, char ** argv) {
          for (int i = 0; i < params.reps; i++) {
              llama_kv_self_clear(ctx);
  
+            if (t.n_depth > 0) {
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
+                }
+                test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+            }
+
              uint64_t t_start = get_time_ns();
  
              if (t.n_prompt > 0) {
author	Vishal Agarwal <redacted>
	Mon, 28 Apr 2025 14:50:39 +0000 (20:20 +0530)
committer	GitHub <redacted>
	Mon, 28 Apr 2025 14:50:39 +0000 (16:50 +0200)
examples/llama-bench/README.md		patch \| blob \| history
examples/llama-bench/llama-bench.cpp		patch \| blob \| history