cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
- cparams.mul_mat_q = params.mul_mat_q;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding;
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
- fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
bool kl_divergence = false; // compute KL-divergence
- bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+ printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
- printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+ printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1;
}
int n_kv_max = 2048;
int is_pp_shared = 0;
int n_gpu_layers = 0;
- int mmq = 0;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
}
if (argc >= 6) {
- mmq = std::atoi(argv[5]);
+ n_pp = parse_list(argv[5]);
}
if (argc >= 7) {
- n_pp = parse_list(argv[6]);
+ n_tg = parse_list(argv[6]);
}
if (argc >= 8) {
- n_tg = parse_list(argv[7]);
- }
-
- if (argc >= 9) {
- n_pl = parse_list(argv[8]);
+ n_pl = parse_list(argv[7]);
}
// init LLM
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512;
- ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
}
LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
- -mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
- std::vector<bool> mul_mat_q;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
int reps;
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
- /* mul_mat_q */ {true},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* reps */ 5,
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
- printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
}
auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
- } else if (arg == "-mmq" || arg == "--mul-mat-q") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- auto p = split<bool>(argv[i], split_delim);
- params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
- if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
- bool mul_mat_q;
std::vector<float> tensor_split;
bool use_mmap;
cparams.n_batch = n_batch;
cparams.type_k = type_k;
cparams.type_v = type_v;
- cparams.mul_mat_q = mul_mat_q;
cparams.offload_kqv = !no_kv_offload;
return cparams;
for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
- for (const auto & mmq : params.mul_mat_q)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
- /* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
};
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
- /* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
};
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
- bool mul_mat_q;
std::vector<float> tensor_split;
bool use_mmap;
int n_prompt;
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
- mul_mat_q = inst.mul_mat_q;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
n_prompt = inst.n_prompt;
"n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
- "mul_mat_q", "tensor_split", "use_mmap",
+ "tensor_split", "use_mmap",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
- field == "mul_mat_q" || field == "use_mmap") {
+ field == "use_mmap") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
- std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
+ tensor_split_str, std::to_string(use_mmap),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
if (field == "n_threads") {
return "threads";
}
- if (field == "mul_mat_q") {
- return "mmq";
- }
if (field == "no_kv_offload") {
return "nkvo";
}
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
fields.emplace_back("split_mode");
}
- if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
- fields.emplace_back("mul_mat_q");
- }
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.emplace_back("no_kv_offload");
}
}
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
- }
- else if (arg == "--no-mul-mat-q" || arg == "-nommq")
- {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
- params.mul_mat_q = false;
-#else
- LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
#endif // GGML_USE_CUBLAS
}
else if (arg == "--main-gpu" || arg == "-mg")
float yarn_beta_slow;
float defrag_thold;
- bool mul_mat_q;
bool offload_kqv;
bool do_pooling;
/*.cb_eval_user_data =*/ nullptr,
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
- /*.mul_mat_q =*/ true,
/*.logits_all =*/ false,
/*.embedding =*/ false,
/*.offload_kqv =*/ true,
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
- cparams.mul_mat_q = params.mul_mat_q;
cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;
enum ggml_type type_v; // data type for V cache
// Keep the booleans together to avoid misalignment during copy-by-value.
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
"model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
"n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
"n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
- "mul_mat_q": "MMQ", "tensor_split": "Tensor split"
+ "tensor_split": "Tensor split"
}
DEFAULT_SHOW = ["model_type"] # Always show these properties by default.