    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
+   ggml_numa_strategy numa;
    int reps;
    bool verbose;
    output_formats output_format;
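
The new field's type is ggml's `ggml_numa_strategy`. For reference, a sketch of the enum as it appears in `ggml.h` around this change (the header is authoritative; the comments paraphrase the CLI help text):

    enum ggml_numa_strategy {
        GGML_NUMA_STRATEGY_DISABLED   = 0,
        GGML_NUMA_STRATEGY_DISTRIBUTE = 1, // spread execution evenly over all nodes
        GGML_NUMA_STRATEGY_ISOLATE    = 2, // only spawn threads on CPUs on the node that execution started on
        GGML_NUMA_STRATEGY_NUMACTL    = 3, // use the CPU map provided by numactl
        GGML_NUMA_STRATEGY_MIRROR     = 4,
        GGML_NUMA_STRATEGY_COUNT
    };
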
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
+   /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
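
Unlike the vector-valued parameters above, which llama-bench sweeps to generate one test per value, `numa` is a single setting for the whole process. A minimal sketch of how such defaults typically get back-filled during parsing (paraphrased, not quoted from the patch):

    if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
    params.numa = cmd_params_defaults.numa; // scalar: simply seeded from the defaults
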
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+ printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+       } else if (arg == "--numa") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           } else {
+               std::string value(argv[i]);
+               /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+               else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+               else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+               else { invalid_param = true; break; }
+           }
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
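
The new `--numa` branch mirrors the shape of its neighbors, such as the `-fa` handling that follows it; note that an empty value is accepted and treated like `distribute`. The same mapping could be factored into a small helper; a sketch under that assumption (`parse_numa_strategy` is not part of the patch):

    #include <string>
    #include "ggml.h"

    // hypothetical helper, equivalent to the inline chain above
    static ggml_numa_strategy parse_numa_strategy(const std::string & value, bool & ok) {
        ok = true;
        if (value == "distribute" || value.empty()) { return GGML_NUMA_STRATEGY_DISTRIBUTE; }
        if (value == "isolate")                     { return GGML_NUMA_STRATEGY_ISOLATE; }
        if (value == "numactl")                     { return GGML_NUMA_STRATEGY_NUMACTL; }
        ok = false;
        return GGML_NUMA_STRATEGY_DISABLED;
    }
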
    if (!params.verbose) {
        llama_log_set(llama_null_log_callback, NULL);
    }

    llama_backend_init();
+   llama_numa_init(params.numa);

    // initialize printer
    std::unique_ptr<printer> p;
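
`llama_numa_init` runs once, right after the backend comes up and before any model is loaded. A minimal standalone sketch of the same sequence (no model loading shown):

    #include "llama.h"

    int main() {
        llama_backend_init();                           // bring up the backend first
        llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // then apply the NUMA strategy
        // ... load a model and run the benchmark here ...
        llama_backend_free();                           // tear down at exit
        return 0;
    }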