-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
- -b, --batch-size <n> (default: 512)
- -ctk <t>, --cache-type-k <t> (default: f16)
- -ctv <t>, --cache-type-v <t> (default: f16)
- -t, --threads <n> (default: 112)
+ -pg <pp,tg> (default: 512,128)
+ -b, --batch-size <n> (default: 2048)
+ -ub, --ubatch-size <n> (default: 512)
+ -ctk, --cache-type-k <t> (default: f16)
+ -ctv, --cache-type-v <t> (default: f16)
+ -t, --threads <n> (default: 16)
-ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
+ -fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
- -ts, --tensor_split <ts0/ts1/..> (default: 0)
+ --numa <distribute|isolate|numactl> (default: disabled)
+ -embd, --embeddings <0|1> (default: 0)
+ -ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
}
}
+// Render an (n_prompt, n_gen) pair as "pp,tg" text, e.g. {512, 128} -> "512,128";
+// used when printing the -pg option's values (see the usage printf below).
+// NOTE(review): 'static' on buf makes this non-reentrant / not thread-safe.
+// It is harmless here because the returned std::string copies out of buf
+// before any reuse, but a plain (non-static) local buffer would be safer.
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
std::vector<int> n_gen;
+ std::vector<std::pair<int, int>> n_pg;
std::vector<int> n_batch;
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
+ /* n_pg */ {{512, 128}},
/* n_batch */ {2048},
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+ printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
- printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
- printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
- printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+ printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+ printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+ printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
}
auto p = split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+ } else if (arg == "-pg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split<std::string>(argv[i], ',');
+ if (p.size() != 2) {
+ invalid_param = true;
+ break;
+ }
+ params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
if (params.model.empty()) { params.model = cmd_params_defaults.model; }
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+ if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
};
instances.push_back(instance);
}
+
+ for (const auto & n_pg : params.n_pg) {
+ if (n_pg.first == 0 && n_pg.second == 0) {
+ continue;
+ }
+ cmd_params_instance instance = {
+ /* .model = */ m,
+ /* .n_prompt = */ n_pg.first,
+ /* .n_gen = */ n_pg.second,
+ /* .n_batch = */ nb,
+ /* .n_ubatch = */ nub,
+ /* .type_k = */ tk,
+ /* .type_v = */ tv,
+ /* .n_threads = */ nt,
+ /* .n_gpu_layers = */ nl,
+ /* .split_mode = */ sm,
+ /* .main_gpu = */ mg,
+ /* .no_kv_offload= */ nkvo,
+ /* .flash_attn = */ fa,
+ /* .tensor_split = */ ts,
+ /* .use_mmap = */ mmp,
+ /* .embeddings = */ embd,
+ };
+ instances.push_back(instance);
+ }
}
return instances;
if (field == "n_gpu_layers") {
return 3;
}
+ if (field == "test") {
+ return 13;
+ }
int width = std::max((int)field.length(), 10);
value = test::get_backend();
} else if (field == "test") {
if (t.n_prompt > 0 && t.n_gen == 0) {
- snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+ snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) {
- snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+ snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
} else {
- assert(false);
- exit(1);
+ snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
}
value = buf;
} else if (field == "t/s") {
llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns();
+
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}