#include <algorithm>
#include <cassert>
+#include <cinttypes>
#include <cmath>
+#include <codecvt>
+#include <cstdarg>
#include <cstring>
#include <ctime>
#include <fstream>
-#include <iterator>
#include <iostream>
+#include <iterator>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
-#include <cinttypes>
-#include <codecvt>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
}
}
- if (params.prompt_cache_all &&
- (params.interactive || params.interactive_first ||
- params.instruct)) {
-
+ if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
- bool result = true;
+ const auto params_org = params; // the example can modify the default params
+
try {
- if (!gpt_params_parse_ex(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(0);
+ if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
+ params = params_org;
+ params.usage = true;
+ return false;
}
- }
- catch (const std::invalid_argument & ex) {
+ } catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(1);
+ return false;
}
- return result;
+
+ return true;
}
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+    const char split_delim = ','; // delimiter for comma-separated list arguments (-npp, -ntg, -npl)
+
llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
invalid_param = true;
return true;
}
- // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
+ // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true;
params.escape = true;
return true;
}
+ if (arg == "--no-escape") {
+ params.escape = false;
+ return true;
+ }
if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
}
return true;
}
- if (arg == "-n" || arg == "--n-predict") {
+ if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
return true;
params.interactive = true;
return true;
}
- if (arg == "--interactive-specials") {
- params.interactive_specials = true;
- return true;
- }
- if (arg == "--special") {
+ if (arg == "-sp" || arg == "--special") {
params.special = true;
return true;
}
- if (arg == "--embedding") {
+ if (arg == "--embedding" || arg == "--embeddings") {
params.embedding = true;
return true;
}
- if (arg == "--interactive-first") {
+ if (arg == "-if" || arg == "--interactive-first") {
params.interactive_first = true;
return true;
}
- if (arg == "-ins" || arg == "--instruct") {
- params.instruct = true;
- return true;
- }
if (arg == "-cnv" || arg == "--conversation") {
params.conversation = true;
return true;
}
- if (arg == "-cml" || arg == "--chatml") {
- params.chatml = true;
- return true;
- }
if (arg == "--infill") {
params.infill = true;
return true;
params.flash_attn = true;
return true;
}
- if (arg == "--color") {
+ if (arg == "-co" || arg == "--color") {
params.use_color = true;
return true;
}
params.use_mlock = true;
return true;
}
- if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.n_gpu_layers = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
}
return true;
}
- if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
}
return true;
else { invalid_param = true; }
return true;
}
+ if (arg == "-v" || arg == "--verbose") {
+ params.verbose = true;
+ return true;
+ }
if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
return true;
params.ppl_stride = std::stoi(argv[i]);
return true;
}
- if (arg == "-ptc" || arg == "--print-token-count") {
+ if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
return true;
}
- params.n_print = std::stoi(argv[i]);
- return true;
- }
- if (arg == "--check-tensors") {
- params.check_tensors = true;
+ params.ppl_output_type = std::stoi(argv[i]);
return true;
}
- if (arg == "--ppl-output-type") {
+ if (arg == "-ptc" || arg == "--print-token-count") {
if (++i >= argc) {
invalid_param = true;
return true;
}
- params.ppl_output_type = std::stoi(argv[i]);
+ params.n_print = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--check-tensors") {
+ params.check_tensors = true;
return true;
}
if (arg == "--hellaswag") {
}
return true;
}
- if (arg == "-h" || arg == "--help") {
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(0);
+    if (arg == "-h" || arg == "--help" || arg == "--usage") {
+ params.usage = true;
+ return true;
}
if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
- if (arg == "--random-prompt") {
- params.random_prompt = true;
- return true;
- }
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
return true;
}
return true;
}
+ if (arg == "--host") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hostname = argv[i];
+ return true;
+ }
+ if (arg == "--port") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.port = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--path") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.public_path = argv[i];
+ return true;
+ }
+ if (arg == "--api-key") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.api_keys.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "--api-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream key_file(argv[i]);
+ if (!key_file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::string key;
+ while (std::getline(key_file, key)) {
+ if (!key.empty()) {
+ params.api_keys.push_back(key);
+ }
+ }
+ key_file.close();
+ return true;
+ }
+ if (arg == "--ssl-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ssl_file_key = argv[i];
+ return true;
+ }
+ if (arg == "--ssl-cert-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ssl_file_cert = argv[i];
+ return true;
+ }
+ if (arg == "--timeout" || arg == "-to") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
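+        // a single --timeout value sets both the read and the write timeout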
+ params.timeout_read = std::stoi(argv[i]);
+ params.timeout_write = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-spf" || arg == "--system-prompt-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::string system_prompt;
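+        // read the entire file into the system prompt string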
+ std::copy(
+ std::istreambuf_iterator<char>(file),
+ std::istreambuf_iterator<char>(),
+ std::back_inserter(system_prompt)
+ );
+ params.system_prompt = system_prompt;
+ return true;
+ }
+ if (arg == "--log-format") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (std::strcmp(argv[i], "json") == 0) {
+ params.log_json = true;
+ } else if (std::strcmp(argv[i], "text") == 0) {
+ params.log_json = false;
+ } else {
+ invalid_param = true;
+ return true;
+ }
+ return true;
+ }
+ if (arg == "--no-slots") {
+ params.endpoint_slots = false;
+ return true;
+ }
+ if (arg == "--metrics") {
+ params.endpoint_metrics = true;
+ return true;
+ }
+ if (arg == "--slot-save-path") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.slot_save_path = argv[i];
+        // if the path doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.slot_save_path += DIRECTORY_SEPARATOR;
+ }
+ return true;
+ }
+ if (arg == "--chat-template") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (!llama_chat_verify_template(argv[i])) {
+ fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
+            fprintf(stderr, "note: llama.cpp does not use a jinja parser, only commonly used templates are supported\n");
+ invalid_param = true;
+ return true;
+ }
+ params.chat_template = argv[i];
+ return true;
+ }
+ if (arg == "-pps") {
+ params.is_pp_shared = true;
+ return true;
+ }
+ if (arg == "-npp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "-ntg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "-npl") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "--context-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
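+        // opened only to verify that the file exists and is readable; the path is
+        // stored and the file is read later by the retrieval example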
+ std::ifstream file(argv[i], std::ios::binary);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.context_files.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "--chunk-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.chunk_size = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--chunk-separator") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.chunk_separator = argv[i];
+ return true;
+ }
+ if (arg == "--junk") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_junk = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--pos") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.i_pos = std::stoi(argv[i]);
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
return false;
}
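+// enable compile-time printf-format checking on GCC-compatible compilers;
+// MinGW needs gnu_printf because its default (MSVC) printf lacks some C99 specifiers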
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sparams;
}
sampler_type_names.pop_back();
- printf("\n");
- printf("usage: %s [options]\n", argv[0]);
- printf("\n");
- printf("options:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" --version show version and build info\n");
- printf(" -i, --interactive run in interactive mode\n");
- printf(" --special special tokens output enabled\n");
- printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
- printf(" --interactive-first run in interactive mode and wait for input right away\n");
- printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
- printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
- printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
- printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
- printf(" -r PROMPT, --reverse-prompt PROMPT\n");
- printf(" halt generation at PROMPT, return control in interactive mode\n");
- printf(" (can be specified more than once for multiple prompts).\n");
- printf(" --color colorise output to distinguish prompt and user input from generations\n");
- printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
- printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
- printf(" -tb N, --threads-batch N\n");
- printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
- printf(" -td N, --threads-draft N");
- printf(" number of threads to use during generation (default: same as --threads)\n");
- printf(" -tbd N, --threads-batch-draft N\n");
- printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
- printf(" -p PROMPT, --prompt PROMPT\n");
- printf(" prompt to start generation with (default: empty)\n");
- printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
- printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
- printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
- printf(" not supported with --interactive or other interactive options\n");
- printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
- printf(" --random-prompt start with a randomized prompt.\n");
- printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
- printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
- printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
- printf(" -f FNAME, --file FNAME\n");
- printf(" prompt file to start generation.\n");
- printf(" -bf FNAME, --binary-file FNAME\n");
- printf(" binary file containing multiple choice tasks.\n");
- printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
- printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
- printf(" -ub N, --ubatch-size N\n");
- printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
- printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
- printf(" (default: %s)\n", sampler_type_names.c_str());
- printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
- printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
- printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
- printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
- printf(" --mirostat N use Mirostat sampling.\n");
- printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
- printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
- printf(" modifies the likelihood of token appearing in the completion,\n");
- printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
- printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
- printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
- printf(" --grammar-file FNAME file to read grammar from\n");
- printf(" -j SCHEMA, --json-schema SCHEMA\n");
- printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
- printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
- printf(" --cfg-negative-prompt PROMPT\n");
- printf(" negative prompt to use for guidance. (default: empty)\n");
- printf(" --cfg-negative-prompt-file FNAME\n");
- printf(" negative prompt file to use for guidance. (default: empty)\n");
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
- printf(" --rope-scaling {none,linear,yarn}\n");
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
- printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
- printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
- printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls}\n");
- printf(" pooling type for embeddings, use model default if unspecified\n");
- printf(" -dt N, --defrag-thold N\n");
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
- printf(" --penalize-nl penalize newline tokens\n");
- printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
- printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
- printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
- printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
- printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
- printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
- printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
- printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
- printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
- printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
- printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
- printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
- printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
- printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
- printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
- printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
- printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
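+    // helper for building the usage listing: an entry is either a group header
+    // (only grp set) or an option row (tags, args, desc); the variadic constructor
+    // formats desc printf-style with the trailing arguments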
+ struct option_info {
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
+ option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
+ va_list args_list;
+ va_start(args_list, desc);
+ char buffer[1024];
+ vsnprintf(buffer, sizeof(buffer), desc, args_list);
+ va_end(args_list);
+ this->desc = buffer;
+ }
+
+ option_info(const std::string & grp) : grp(grp) {}
+
+ std::string tags;
+ std::string args;
+ std::string desc;
+ std::string grp;
+ };
+
+ std::vector<option_info> options;
+
+ // TODO: filter by tags
+
+ options.push_back({ "general" });
+ options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
+ options.push_back({ "*", " --version", "show version and build info" });
+ options.push_back({ "*", "-v, --verbose", "print verbose information" });
+ options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
+ options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
+ options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
+ options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+ options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
+ options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+ options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
+ options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
+ options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
+ "path to static lookup cache to use for lookup decoding (not updated by generation)" });
+ options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
+ "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
+
+ options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
+ options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+ options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
+ options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
+ options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
+ options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
+ options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+ options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+ options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
+    options.push_back({ "*", "-e, --escape", "process escape sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
+ options.push_back({ "*", " --no-escape", "do not process escape sequences" });
+ options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
+ options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
+ options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
+ "not supported with --interactive or other interactive options" });
+ options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
+ options.push_back({ "main", "-r, --reverse-prompt PROMPT",
+ "halt generation at PROMPT, return control in interactive mode\n"
+ "can be specified more than once for multiple prompts" });
+ options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+ options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
+ options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
+ options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
+ options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
+ options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
+ options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+
+ options.push_back({ "sampling" });
+    options.push_back({ "*", "    --samplers SAMPLERS", "samplers that will be used for generation, in order, separated by \';\'\n"
+ "(default: %s)", sampler_type_names.c_str() });
+ options.push_back({ "*", " --sampling-seq SEQUENCE",
+ "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
+ options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
+ options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
+ options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
+ options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
+ options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
+ options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
+ options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
+ options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
+ options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
+ options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
+ options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
+ options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
+ options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
+ options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
+ options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
+ "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
+ options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
+ options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
+ options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
+ "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+ "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
+ options.push_back({ "main", " --cfg-negative-prompt PROMPT",
+ "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
+ options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
+ "negative prompt file to use for guidance" });
+ options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+
+ options.push_back({ "grammar" });
+ options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
+ options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
+ options.push_back({ "*", "-j, --json-schema SCHEMA",
+ "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
+ "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
+
+ options.push_back({ "embedding" });
+ options.push_back({ "embedding", " --pooling {none,mean,cls}",
+ "pooling type for embeddings, use model default if unspecified" });
+
+ options.push_back({ "context hacking" });
+ options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
+ "RoPE frequency scaling method, defaults to linear unless specified by the model" });
+ options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
+ options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
+ options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
+ options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
+ options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
+ options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
+ options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
+ options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
+ options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
+ options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
+ options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
+ options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
+ options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
+ options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
+
+ options.push_back({ "perplexity" });
+ options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
+ options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
+ options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
+ options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --multiple-choice-tasks N",
+ "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
+ options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
+ options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
+ options.push_back({ "perplexity", " --ppl-output-type {0,1}",
+ "output type for perplexity calculation (default: %d)", params.ppl_output_type });
+
+ options.push_back({ "parallel" });
+ options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
+ options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
+ options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
+ options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+
+ options.push_back({ "multi-modality" });
+ options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
+    options.push_back({ "*", "    --image FILE", "path to an image file. use with multimodal models. specify multiple times for batching" });
+
+ options.push_back({ "backend" });
+ options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
if (llama_supports_mlock()) {
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
+ options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
}
if (llama_supports_mmap()) {
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
- }
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
- printf(" - distribute: spread execution evenly over all nodes\n");
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
- printf(" - numactl: use the CPU map provided by numactl\n");
- printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
- printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+ options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
+ }
+ options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
+ " - distribute: spread execution evenly over all nodes\n"
+ " - isolate: only spawn threads on CPUs on the node that execution started on\n"
+ " - numactl: use the CPU map provided by numactl\n"
+ "if run without this previously, it is recommended to drop the system page cache before using this\n"
+ "see https://github.com/ggerganov/llama.cpp/issues/1437" });
+
if (llama_supports_gpu_offload()) {
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -ngld N, --n-gpu-layers-draft N\n");
- printf(" number of layers to store in VRAM for the draft model\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT, --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
- }
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
- printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
- printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
- printf(" -gan N, --grp-attn-n N\n");
- printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
- printf(" -gaw N, --grp-attn-w N\n");
- printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
- printf(" -dkvc, --dump-kv-cache\n");
- printf(" verbose print of the KV cache\n");
- printf(" -nkvo, --no-kv-offload\n");
- printf(" disable KV offload\n");
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
- printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
- printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
- printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
- printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
- printf(" --control-vector FNAME\n");
- printf(" add a control vector\n");
- printf(" --control-vector-scaled FNAME S\n");
- printf(" add a control vector with user defined scaling S\n");
- printf(" --control-vector-layer-range START END\n");
- printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
- printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
- printf(" -md FNAME, --model-draft FNAME\n");
- printf(" draft model for speculative decoding (default: unused)\n");
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: unused)\n");
- printf(" -hfr REPO, --hf-repo REPO\n");
- printf(" Hugging Face model repository (default: unused)\n");
- printf(" -hff FILE, --hf-file FILE\n");
- printf(" Hugging Face model file (default: unused)\n");
- printf(" -ld LOGDIR, --logdir LOGDIR\n");
- printf(" path under which to save YAML logs (no logging if unset)\n");
- printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
- printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
- printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
- printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
- printf(" --override-kv KEY=TYPE:VALUE\n");
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -ptc N, --print-token-count N\n");
- printf(" print token count every N tokens (default: %d)\n", params.n_print);
- printf(" --check-tensors check model tensor data for invalid values\n");
- printf("\n");
+ options.push_back({ "*", "-ngl, --gpu-layers N",
+ "number of layers to store in VRAM" });
+ options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+ "number of layers to store in VRAM for the draft model" });
+ options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
+ "how to split the model across multiple GPUs, one of:\n"
+ " - none: use one GPU only\n"
+ " - layer (default): split layers and KV across GPUs\n"
+ " - row: split rows across GPUs" });
+ options.push_back({ "*", "-ts, --tensor-split SPLIT",
+ "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+ options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
+ "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
+ }
+
+ options.push_back({ "model" });
+ options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
+ options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
+ "advanced option to override model metadata by key. may be specified multiple times.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
+ options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+ options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
+ "add a control vector with user defined scaling SCALE" });
+ options.push_back({ "*", " --control-vector-layer-range START END",
+ "layer range to apply the control vector(s) to, start and end inclusive" });
+ options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
+ "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+ options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
+ options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
+ options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
+ options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+
+ options.push_back({ "retrieval" });
+ options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
+ options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
+ options.push_back({ "retrieval", " --chunk-separator STRING",
+ "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
+
+ options.push_back({ "passkey" });
+ options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+ options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
+
+ options.push_back({ "bench" });
+ options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
+ options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
+ options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
+ options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
+
+ options.push_back({ "server" });
+    options.push_back({ "server", "    --host HOST", "IP address to listen on (default: %s)", params.hostname.c_str() });
+    options.push_back({ "server", "    --port PORT", "port to listen on (default: %d)", params.port });
+ options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
+ options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
+ options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
+    options.push_back({ "server", "    --ssl-key-file FNAME", "path to file containing a PEM-encoded SSL private key" });
+    options.push_back({ "server", "    --ssl-cert-file FNAME", "path to file containing a PEM-encoded SSL certificate" });
+ options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+    options.push_back({ "server", "    --system-prompt-file FNAME",
+                        "set a file to load a system prompt (initial prompt of all slots); useful for chat applications" });
+ options.push_back({ "server", " --log-format {text,json}",
+ "log output format: json or text (default: json)" });
+ options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
+    options.push_back({ "server", "    --no-slots", "disable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
+ options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
+ options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "only commonly used templates are accepted:\n"
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+
#ifndef LOG_DISABLE_LOGS
- log_print_usage();
+ options.push_back({ "logging" });
+ options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
+ options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
+ options.push_back({ "logging", " --log-test", "Run simple logging test" });
+ options.push_back({ "logging", " --log-disable", "Disable trace logs" });
+ options.push_back({ "logging", " --log-enable", "Enable trace logs" });
+ options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
+ options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
+ "Each log file will have unique name: \"<name>.<ID>.log\"" });
+ options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
#endif // LOG_DISABLE_LOGS
+
+ printf("usage: %s [options]\n", argv[0]);
+
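+    // render a two-column listing: the flags padded to a fixed width on the left,
+    // the description on the right, with multi-line descriptions re-indented so
+    // continuation lines stay aligned under the description column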
+ for (const auto & o : options) {
+ if (!o.grp.empty()) {
+ printf("\n%s:\n\n", o.grp.c_str());
+ continue;
+ }
+ printf(" %-32s", o.args.c_str());
+ if (o.args.length() > 30) {
+ printf("\n%34s", "");
+ }
+
+ const auto desc = o.desc;
+ size_t start = 0;
+ size_t end = desc.find('\n');
+ while (end != std::string::npos) {
+ printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
+ start = end + 1;
+ end = desc.find('\n', start);
+ }
+
+ printf("%s\n", desc.substr(start).c_str());
+ }
+ printf("\n");
}
std::string gpt_params_get_system_info(const gpt_params & params) {
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
-std::string string_random_prompt(std::mt19937 & rng) {
- const int r = rng() % 10;
- switch (r) {
- case 0: return "So";
- case 1: return "Once upon a time";
- case 2: return "When";
- case 3: return "The";
- case 4: return "After";
- case 5: return "If";
- case 6: return "import";
- case 7: return "He";
- case 8: return "She";
- case 9: return "They";
- }
-
- GGML_UNREACHABLE();
-}
-
void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
+bool llama_chat_verify_template(const std::string & tmpl) {
+ llama_chat_message chat[] = {{"user", "test"}};
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ return res >= 0;
+}
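+
+// usage sketch (illustrative): callers such as the --chat-template handler can
+// reject an unsupported template up front:
+//
+//   if (!llama_chat_verify_template(params.chat_template)) {
+//       fprintf(stderr, "error: unsupported chat template\n");
+//   }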
+
//
// KV cache utils
//
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
- fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
- fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
- fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
fprintf(stream, "reverse_prompt:\n");
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 512; // context size
+ int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
// // sampling parameters
struct llama_sampling_params sparams;
- std::string model = ""; // model path
- std::string model_draft = ""; // draft model for speculative decoding
+ std::string model = ""; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
- std::string model_url = ""; // model url to download
- std::string hf_repo = ""; // HF repo
- std::string hf_file = ""; // HF file
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
std::string prompt = "";
- std::string prompt_file = ""; // store the external prompt file name
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
- std::string input_prefix = ""; // string to prefix user inputs with
- std::string input_suffix = ""; // string to suffix user inputs with
- std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
- std::string logdir = ""; // directory in which to save YAML log files
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
+ std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
- std::string logits_file = ""; // file for saving *all* logits
+ std::string logits_file = ""; // file for saving *all* logits
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
- int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
- int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
bool kl_divergence = false; // compute KL divergence
- bool random_prompt = false; // do not randomize prompt if none provided
+ bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
- bool interactive = false; // interactive mode
- bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
+ bool interactive = false; // interactive mode
+ bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
- bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
- bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
- bool interactive_first = false; // wait for user input immediately
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
- bool instruct = false; // instruction mode (used for Alpaca models)
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
+    bool verbose           = false; // print verbose information
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // server params
+ int32_t port = 8080;
+ int32_t timeout_read = 600;
+ int32_t timeout_write = timeout_read;
+ int32_t n_threads_http = -1;
+
+ std::string hostname = "127.0.0.1";
+ std::string public_path = "";
+ std::string chat_template = "";
+ std::string system_prompt = "";
+
+ std::vector<std::string> api_keys;
+
+ std::string ssl_file_key = "";
+ std::string ssl_file_cert = "";
+
+ bool endpoint_slots = true;
+ bool endpoint_metrics = false;
+
+ bool log_json = false;
+
+ std::string slot_save_path;
+
+ // batched-bench params
+ bool is_pp_shared = false;
+
+ std::vector<int32_t> n_pp;
+ std::vector<int32_t> n_tg;
+ std::vector<int32_t> n_pl;
+
+ // retrieval params
+ std::vector<std::string> context_files; // context files to embed
+
+ int32_t chunk_size = 64; // chunk size for context embedding
+
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+ // passkey params
+ int32_t n_junk = 250; // number of times to repeat the junk text
+ int32_t i_pos = -1; // position of the passkey in the junk text
};
void gpt_params_handle_model_default(gpt_params & params);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+ std::vector<T> values;
+ std::istringstream str_stream(str);
+ std::string token;
+ while (std::getline(str_stream, token, delim)) {
+ T value;
+ std::istringstream token_stream(token);
+ token_stream >> value;
+ values.push_back(value);
+ }
+ return values;
+}
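As a quick illustration, here is a minimal usage sketch for `string_split`, assuming the template above is in scope (the call site and values are hypothetical, not part of the patch):
```cpp
#include <cassert>
#include <string>
#include <vector>

// hypothetical call site: turning a "-npp 128,256,512" style value into ints
static void string_split_example() {
    const std::vector<int> n_pp = string_split<int>("128,256,512", ',');
    assert(n_pp.size() == 3 && n_pp[0] == 128 && n_pp[2] == 512);

    // caveat: for T = std::string, operator>> stops at whitespace, so a token
    // such as "a b" would come back truncated to "a"
}
```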
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
+//
+// Chat template utils
+//
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
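A minimal sketch of how a caller might validate a `--chat-template` value with this helper, assuming only the declaration above (the wrapper and error text are illustrative):
```cpp
#include <cstdio>
#include <string>

// illustrative wrapper around llama_chat_verify_template
static bool validate_chat_template_arg(const std::string & tmpl) {
    if (tmpl.empty()) {
        return true; // empty: fall back to the template from the model metadata
    }
    if (!llama_chat_verify_template(tmpl)) {
        fprintf(stderr, "error: unsupported chat template '%s'\n", tmpl.c_str());
        return false;
    }
    return true;
}
```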
//
// KV cache utils
//
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
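For example, with a shared prompt of `PP = 512` and `B = 8` sequences each generating `TG = 128` tokens, the cache must hold `N_KV = 512 + 8*128 = 1536` cells.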
```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
+./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results
return ret;
}
-int main(int argc, char ** argv) {
- gpt_params params;
-
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
- printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
- printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
- return 1 ;
- }
-
- int n_kv_max = 2048;
- int n_batch = 2048;
- int n_ubatch = 512;
- bool flash_attn = false;
- int is_pp_shared = 0;
- int n_gpu_layers = 0;
-
- std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
- std::vector<int> n_tg = { 128, 256, };
- std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
- //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_kv_max = std::atoi(argv[2]);
- }
-
- if (argc >= 4) {
- n_batch = std::atoi(argv[3]);
- }
-
- if (argc >= 5) {
- n_ubatch = std::atoi(argv[4]);
- }
-
- if (argc >= 6) {
- flash_attn = std::atoi(argv[5]);
- }
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
- if (argc >= 7) {
- is_pp_shared = std::atoi(argv[6]);
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG_TEE("\n");
+}
- if (argc >= 8) {
- n_gpu_layers = std::atoi(argv[7]);
- }
+int main(int argc, char ** argv) {
+ gpt_params params;
- if (argc >= 9) {
- n_pp = parse_list(argv[8]);
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- if (argc >= 10) {
- n_tg = parse_list(argv[9]);
- }
+ int is_pp_shared = params.is_pp_shared;
- if (argc >= 11) {
- n_pl = parse_list(argv[10]);
- }
+ std::vector<int> n_pp = params.n_pp;
+ std::vector<int> n_tg = params.n_tg;
+ std::vector<int> n_pl = params.n_pl;
// init LLM
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- const std::vector<float> t_split(llama_max_devices(), 0.0f);
-
- model_params.n_gpu_layers = n_gpu_layers;
- model_params.tensor_split = t_split.data();
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
return 1;
}
- llama_context_params ctx_params = llama_context_default_params();
-
- ctx_params.seed = 1234;
- ctx_params.n_ctx = n_kv_max;
- ctx_params.n_batch = n_batch;
- ctx_params.n_ubatch = n_ubatch;
- ctx_params.flash_attn = flash_attn;
-
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
return 1;
}
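+ // note: the max KV size is now taken from the created context (set via -c) rather than a positional argument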
+ const int32_t n_kv_max = llama_n_ctx(ctx);
+
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
}
LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
The example demonstrates batched generation from a given prompt
```bash
-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...
#include <string>
#include <vector>
-int main(int argc, char ** argv) {
- gpt_params params;
-
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
- return 1 ;
- }
-
- // number of parallel batches
- int n_parallel = 1;
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
- // total length of the sequences including the prompt
- int n_len = 32;
-
- // number of layers to offload to the GPU
- int n_gpu_layers = 0;
-
- if (argc >= 2) {
- params.model = argv[1];
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG_TEE("\n");
+}
- if (argc >= 3) {
- params.prompt = argv[2];
- }
+int main(int argc, char ** argv) {
+ gpt_params params;
- if (argc >= 4) {
- n_parallel = std::atoi(argv[3]);
- }
+ params.prompt = "Hello my name is";
+ params.n_predict = 32;
- if (argc >= 5) {
- n_len = std::atoi(argv[4]);
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- if (argc >= 6) {
- n_gpu_layers = std::atoi(argv[5]);
- }
- if (params.prompt.empty()) {
- params.prompt = "Hello my name is";
- }
+ // number of parallel batches
+ int n_parallel = params.n_parallel;
- string_process_escapes(params.prompt);
+ // total length of the sequences including the prompt
+ int n_predict = params.n_predict;
// init LLM
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = n_gpu_layers;
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
- const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
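+ // required KV cells: one shared copy of the prompt, plus (n_predict - prompt length) generated tokens for each of the n_parallel sequences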
+ const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
- ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
- ctx_params.n_batch = std::max(n_len, n_parallel);
- ctx_params.n_seq_max = n_parallel;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
return 1;
}
- const int n_ctx = llama_n_ctx(ctx);
+ const int n_ctx = llama_n_ctx(ctx);
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
const auto t_main_start = ggml_time_us();
- while (n_cur <= n_len) {
+ while (n_cur <= n_predict) {
// prepare the next batch
llama_batch_clear(batch);
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
}
int main(int argc, char ** argv) {
-
callback_data cb_data;
gpt_params params;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
echo
# 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
echo
# 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
echo
# 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
#echo
# 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
echo
# 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo
int main(int argc, char * argv[]) {
gpt_params params;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
}
int main(int argc, char ** argv) {
-
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
gpt_params params;
params.n_batch = 512;
- if (!gpt_params_parse(args.size(), args.data(), params)) {
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
- if (params.instruct) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for instruct mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (params.chatml) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for chatml mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (!params.antiprompt.empty()) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
return 0;
}
- if (params.random_prompt) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (!params.path_prompt_cache.empty()) {
- printf("\n************\n");
- printf("%s: infill does not support prompt caching\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
llama_model * model;
llama_context * ctx;
- llama_context * ctx_guidance = NULL;
+
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (sparams.cfg_scale > 1.f) {
- struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
- ctx_guidance = llama_new_context_with_model(model, lparams);
- }
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
- // Tokenize negative prompt
- std::vector<llama_token> guidance_inp;
- int guidance_offset = 0;
- int original_prompt_len = 0;
- if (ctx_guidance) {
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
- LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
- std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
- LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
- original_prompt_len = original_inp.size();
- guidance_offset = (int)guidance_inp.size() - original_prompt_len;
- LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
- LOG("guidance_offset: %s", log_tostr(guidance_offset));
- }
-
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- if (ctx_guidance) {
- LOG_TEE("\n");
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
- LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
- for (int i = 0; i < (int) guidance_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
- }
- }
-
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
is_interacting = params.interactive_first;
}
- bool input_echo = true;
+ bool input_echo = true;
- int n_past = 0;
- int n_remain = params.n_predict;
- int n_consumed = 0;
- int n_past_guidance = 0;
+ int n_past = 0;
+ int n_remain = params.n_predict;
+ int n_consumed = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
console::set_display(console::prompt);
std::vector<llama_token> embd;
- std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
- if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+ if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
n_past -= n_discard;
- if (ctx_guidance) {
- n_past_guidance -= n_discard;
- }
-
- LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+ LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
-
- if (ctx_guidance) {
- int input_size = 0;
- llama_token * input_buf = NULL;
-
- if (n_past_guidance < (int) guidance_inp.size()) {
- // Guidance context should have the same data with these modifications:
- //
- // * Replace the initial prompt
- // * Shift everything by guidance_offset
- embd_guidance = guidance_inp;
- if (embd.begin() + original_prompt_len < embd.end()) {
- embd_guidance.insert(
- embd_guidance.end(),
- embd.begin() + original_prompt_len,
- embd.end()
- );
- }
-
- input_buf = embd_guidance.data();
- input_size = embd_guidance.size();
-
- LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
- } else {
- input_buf = embd.data();
- input_size = embd.size();
- }
-
- for (int i = 0; i < input_size; i += params.n_batch) {
- int n_eval = std::min(input_size - i, params.n_batch);
- if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
- return 1;
- }
-
- n_past_guidance += n_eval;
- }
- }
-
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
}
embd.clear();
- embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-
- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+ const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
llama_sampling_accept(ctx_sampling, ctx, id, true);
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
-
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd.clear();
- embd_guidance.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
- if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
return str.str();
}
-template<class T>
-static std::vector<T> split(const std::string & str, char delim) {
- std::vector<T> values;
- std::istringstream str_stream(str);
- std::string token;
- while (std::getline(str_stream, token, delim)) {
- T value;
- std::istringstream token_stream(token);
- token_stream >> value;
- values.push_back(value);
- }
- return values;
-}
-
template<typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values;
invalid_param = true;
break;
}
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<std::string>(argv[i], ',');
+ auto p = string_split<std::string>(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
invalid_param = true;
break;
}
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
invalid_param = true;
break;
}
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
invalid_param = true;
break;
}
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
invalid_param = true;
break;
}
- params.main_gpu = split<int>(argv[i], split_delim);
+ params.main_gpu = string_split<int>(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
break;
}
- for (auto ts : split<std::string>(argv[i], split_delim)) {
+ for (auto ts : string_split<std::string>(argv[i], split_delim)) {
// split string by ; and /
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
struct llama_model * model = NULL;
};
-static void show_additional_info(int /*argc*/, char ** argv) {
- LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\n example usage:\n");
+ LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
- show_additional_info(argc, argv);
+ print_usage(argc, argv, params);
return 1;
}
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
- gpt_params_print_usage(argc, argv, params);
- show_additional_info(argc, argv);
+ print_usage(argc, argv, {});
return 1;
}
auto model = llava_init(&params);
int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
+
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+./main -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
-- `--random-prompt`: Start with a randomized prompt.
## Interaction
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
+
llama_sampling_params & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
std::vector<llama_token> embd_inp;
- if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
+ if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
- if (params.chatml) {
- params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
- }
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
} else {
LOG("use session tokens\n");
}
// number of tokens to keep when resetting context
- if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+ if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
} else {
params.n_keep += add_bos; // always keep the BOS token
}
- // prefix & suffix for instruct mode
- const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
- const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
-
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
- // chatml prefix & suffix
- const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
- const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
-
- LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
- LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
-
- // in instruct mode, we inject a prefix and a suffix to each input by the user
- if (params.instruct) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("### Instruction:\n\n");
- }
- // similar for chatml mode
- else if (params.chatml) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("<|im_start|>user\n");
- }
- else if (params.conversation) {
+ if (params.conversation) {
params.interactive_first = true;
}
is_interacting = true;
printf("\n");
- } else if (params.instruct || params.chatml) {
- is_interacting = true;
}
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
- if (params.conversation || params.instruct || params.chatml) {
+ if (params.conversation) {
printf("\n> ");
}
const size_t original_size = embd_inp.size();
- // instruct mode: insert instruction prefix
- if (params.instruct && !is_antiprompt) {
- LOG("inserting instruction prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
- }
- // chatml mode: insert user chat prefix
- if (params.chatml && !is_antiprompt) {
- LOG("inserting chatml prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
- }
if (params.escape) {
string_process_escapes(buffer);
}
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
+ const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
- // instruct mode: insert response suffix
- if (params.instruct) {
- LOG("inserting instruction suffix\n");
- embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
- }
- // chatml mode: insert assistant chat suffix
- if (params.chatml) {
- LOG("inserting chatml suffix\n");
- embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
- }
-
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
}
// end of generation
- if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
+ if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
LOG_TEE(" [end of text]\n");
break;
}
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
### Usage
```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
```
#include <string>
#include <vector>
-int main(int argc, char ** argv) {
- gpt_params params;
-
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
- return 1 ;
- }
-
- int seed = -1;
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
- int n_junk = 250; // number of times to repeat the junk text
- int n_keep = 32; // number of tokens in the prompt prefix
- int n_grp = 1; // if more than 1 - perform LongLM SelfExtend
- int i_pos = -1; // position of the passkey in the junk text
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_junk = std::stoi(argv[2]);
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+ LOG_TEE("\n");
+}
- if (argc >= 4) {
- n_grp = std::stoi(argv[3]);
- }
+int main(int argc, char ** argv) {
+ gpt_params params;
- if (argc >= 5) {
- i_pos = std::stoi(argv[4]);
- }
+ params.n_junk = 250;
+ params.n_keep = 32;
+ params.i_pos = -1;
- if (argc >= 6) {
- seed = std::stoi(argv[5]);
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- if (seed == -1) {
- seed = time(NULL);
- }
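+ // params.seed == LLAMA_DEFAULT_SEED means no --seed was given, so fall back to the current time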
+ srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
- srand(seed);
+ int n_junk = params.n_junk;
+ int n_keep = params.n_keep;
+ int n_grp = params.grp_attn_n;
+ int i_pos = params.i_pos;
if (i_pos == -1) {
i_pos = rand() % n_junk;
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
- ctx_params.seed = seed;
- ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
- ctx_params.n_batch = 512;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
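+ // self-extend (grp-attn) stretches the usable context by a factor of n_grp, so allocate the training context times n_grp, plus the kept prefix tokens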
+ ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
- llama_batch batch = llama_batch_init(512, 0, 1);
+ llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
int n_past = 0;
std::vector<llama_token> seq_tokens[2];
};
-static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
std::vector<winogrande_entry> result;
std::istringstream in(prompt);
std::string line;
int main(int argc, char ** argv) {
gpt_params params;
+ params.n_ctx = 512;
+ params.logits_all = true;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
- params.logits_all = true;
-
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
}
const int n_ctx_train = llama_n_ctx_train(model);
+
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
echo
# 3a. Test the requanted model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS
echo
echo
# 4b. Test the requanted model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS
echo
#include <algorithm>
#include <fstream>
-struct retrieval_params {
- std::vector<std::string> context_files; // context files to embed
- int32_t chunk_size = 64; // chunk size for context embedding
- std::string chunk_separator = "\n"; // chunk separator for context embedding
-};
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
-static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
- gpt_params_print_usage(argc, argv, gpt_params);
- printf("retrieval options:\n");
- printf(" --context-file FNAME file containing context to embed.\n");
- printf(" specify multiple files by providing --context-file option multiple times.\n");
- printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
- printf(" --chunk-separator STRING\n");
- printf(" string to separate chunks (default: \"\\n\")\n");
- printf("\n");
-}
-
-static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
- int i = 1;
- std::string arg;
- while (i < argc) {
- arg = argv[i];
- bool invalid_gpt_param = false;
- if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
- if (invalid_gpt_param) {
- fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- // option was parsed by gpt_params_find_arg
- } else if (arg == "--context-file") {
- if (++i >= argc) {
- fprintf(stderr, "error: missing argument for --context-file\n");
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- // store the external file name in params
- retrieval_params.context_files.push_back(argv[i]);
- } else if (arg == "--chunk-size") {
- if (++i >= argc) {
- fprintf(stderr, "error: missing argument for --chunk-size\n");
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- retrieval_params.chunk_size = std::stoi(argv[i]);
- } else if (arg == "--chunk-separator") {
- if (++i >= argc) {
- fprintf(stderr, "error: missing argument for --chunk-separator\n");
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- retrieval_params.chunk_separator = argv[i];
- } else {
- // unknown argument
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- i++;
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+ LOG_TEE("\n");
}
struct chunk {
int main(int argc, char ** argv) {
gpt_params params;
- retrieval_params retrieval_params;
- retrieval_params_parse(argc, argv, params, retrieval_params);
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
+ }
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
+ params.embedding = true;
- if (retrieval_params.chunk_size <= 0) {
+ if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n");
return 1;
}
- if (retrieval_params.context_files.empty()) {
+ if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n");
return 1;
}
- params.embedding = true;
print_build_info();
printf("processing files:\n");
- for (auto & context_file : retrieval_params.context_files) {
+ for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str());
}
std::vector<chunk> chunks;
- for (auto & context_file : retrieval_params.context_files) {
- std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
+ for (auto & context_file : params.context_files) {
+ std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
printf("Number of chunks: %ld\n", chunks.size());
return 1;
}
// add eos if not present
- if (inp.empty() || inp.back() != llama_token_eos(model)) {
+ if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
inp.push_back(llama_token_eos(model));
}
chunk.tokens = inp;
params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
json input_suffix;
};
-struct server_params {
- int32_t port = 8080;
- int32_t read_timeout = 600;
- int32_t write_timeout = 600;
- int32_t n_threads_http = -1;
-
- std::string hostname = "127.0.0.1";
- std::string public_path = "";
- std::string chat_template = "";
- std::string system_prompt = "";
-
- std::vector<std::string> api_keys;
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- std::string ssl_key_file = "";
- std::string ssl_cert_file = "";
-#endif
-
- bool slots_endpoint = true;
- bool metrics_endpoint = false;
- std::string slot_save_path;
-};
-
struct server_slot {
int id;
int id_task = -1;
}
json get_formated_generation(const server_slot & slot) const {
- const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+ const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
std::vector<std::string> samplers_sequence;
}
};
-static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
- printf("usage: %s [options]\n", argv0);
- printf("\n");
- printf("options:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
- printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
- printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
- printf(" --rope-scaling {none,linear,yarn}\n");
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
- printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
- printf(" -dt N, --defrag-thold N\n");
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
- printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
- if (llama_supports_mlock()) {
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
- }
- if (llama_supports_mmap()) {
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
- }
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
- printf(" - distribute: spread execution evenly over all nodes\n");
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
- printf(" - numactl: use the CPU map provided my numactl\n");
- if (llama_supports_gpu_offload()) {
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row)\n");
- printf(" -nkvo, --no-kv-offload\n");
- printf(" disable KV offload\n");
- }
- printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: unused)\n");
- printf(" -hfr REPO, --hf-repo REPO\n");
- printf(" Hugging Face model repository (default: unused)\n");
- printf(" -hff FILE, --hf-file FILE\n");
- printf(" Hugging Face model file (default: unused)\n");
- printf(" -a ALIAS, --alias ALIAS\n");
- printf(" set an alias for the model, will be added as `model` field in completion response\n");
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
- printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
- printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
- printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
- printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
- printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n");
- printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n");
-#endif
- printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
- printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
- printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
- printf(" -spf FNAME, --system-prompt-file FNAME\n");
- printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
- printf(" KV cache data type for K (default: f16)\n");
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
- printf(" KV cache data type for V (default: f16)\n");
- printf(" --log-format log output format: json or text (default: json)\n");
- printf(" --log-disable disables logging to a file.\n");
- printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
- printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
- printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n");
- printf("\n");
- printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
- printf(" --override-kv KEY=TYPE:VALUE\n");
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
- printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
- printf(" --chat-template JINJA_TEMPLATE\n");
- printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
- printf(" only commonly used templates are accepted:\n");
- printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
- printf("\n");
-}
-
-static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
- gpt_params default_params;
- server_params default_sparams;
-
- std::string arg;
- bool invalid_param = false;
-
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
- if (arg == "--port") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.port = std::stoi(argv[i]);
- } else if (arg == "--rpc") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rpc_servers = argv[i];
- } else if (arg == "--host") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.hostname = argv[i];
- } else if (arg == "--path") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.public_path = argv[i];
- } else if (arg == "--api-key") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.api_keys.push_back(argv[i]);
- } else if (arg == "--api-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream key_file(argv[i]);
- if (!key_file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::string key;
- while (std::getline(key_file, key)) {
- if (key.size() > 0) {
- sparams.api_keys.push_back(key);
- }
- }
- key_file.close();
-
- }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- else if (arg == "--ssl-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.ssl_key_file = argv[i];
- } else if (arg == "--ssl-cert-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.ssl_cert_file = argv[i];
- }
-#endif
- else if (arg == "--timeout" || arg == "-to") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.read_timeout = std::stoi(argv[i]);
- sparams.write_timeout = std::stoi(argv[i]);
- } else if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model = argv[i];
- } else if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_url = argv[i];
- } else if (arg == "-hfr" || arg == "--hf-repo") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hf_repo = argv[i];
- } else if (arg == "-hff" || arg == "--hf-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hf_file = argv[i];
- } else if (arg == "-a" || arg == "--alias") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_alias = argv[i];
- } else if (arg == "-h" || arg == "--help") {
- server_print_usage(argv[0], default_params, default_sparams);
- exit(0);
- } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ctx = std::stoi(argv[i]);
- } else if (arg == "--rope-scaling") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
- else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
- else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
- else { invalid_param = true; break; }
- } else if (arg == "--rope-freq-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_base = std::stof(argv[i]);
- } else if (arg == "--rope-freq-scale") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_scale = std::stof(argv[i]);
- } else if (arg == "--yarn-ext-factor") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_ext_factor = std::stof(argv[i]);
- }
- else if (arg == "--yarn-attn-factor") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_attn_factor = std::stof(argv[i]);
- } else if (arg == "--yarn-beta-fast") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_fast = std::stof(argv[i]);
- } else if (arg == "--yarn-beta-slow") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_slow = std::stof(argv[i]);
- } else if (arg == "--pooling") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
- else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
- else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
- else { invalid_param = true; break; }
- } else if (arg == "--defrag-thold" || arg == "-dt") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.defrag_thold = std::stof(argv[i]);
- } else if (arg == "--threads" || arg == "-t") {
- if (++i >= argc)
- {
- invalid_param = true;
- break;
- }
- params.n_threads = std::stoi(argv[i]);
- } else if (arg == "--grp-attn-n" || arg == "-gan") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_n = std::stoi(argv[i]);
- } else if (arg == "--grp-attn-w" || arg == "-gaw") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_w = std::stoi(argv[i]);
- } else if (arg == "--threads-batch" || arg == "-tb") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads_batch = std::stoi(argv[i]);
- } else if (arg == "--threads-http") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.n_threads_http = std::stoi(argv[i]);
- } else if (arg == "-b" || arg == "--batch-size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_batch = std::stoi(argv[i]);
- } else if (arg == "-ub" || arg == "--ubatch-size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ubatch = std::stoi(argv[i]);
- } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (llama_supports_gpu_offload()) {
- params.n_gpu_layers = std::stoi(argv[i]);
- } else {
- LOG_WARNING(
- "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
- "See main README.md for information on enabling GPU BLAS support",
- {{"n_gpu_layers", params.n_gpu_layers}});
- }
- } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
- params.no_kv_offload = true;
- } else if (arg == "--split-mode" || arg == "-sm") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string arg_next = argv[i];
- if (arg_next == "none") {
- params.split_mode = LLAMA_SPLIT_MODE_NONE;
- } else if (arg_next == "layer") {
- params.split_mode = LLAMA_SPLIT_MODE_LAYER;
- } else if (arg_next == "row") {
- params.split_mode = LLAMA_SPLIT_MODE_ROW;
- } else {
- invalid_param = true;
- break;
- }
-#ifndef GGML_USE_CUDA
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA
- } else if (arg == "--tensor-split" || arg == "-ts") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- std::string arg_next = argv[i];
-
- // split string by , and /
- const std::regex regex{R"([,/]+)"};
- std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
- std::vector<std::string> split_arg{it, {}};
- GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
- for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
- if (i_device < split_arg.size()) {
- params.tensor_split[i_device] = std::stof(split_arg[i_device]);
- } else {
- params.tensor_split[i_device] = 0.0f;
- }
- }
-#else
- LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUDA
- } else if (arg == "--main-gpu" || arg == "-mg") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- params.main_gpu = std::stoi(argv[i]);
-#else
- LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
-#endif
- } else if (arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
- } else if (arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- const char * lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- } else if (arg == "--lora-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_base = argv[i];
- } else if (arg == "-v" || arg == "--verbose") {
-#if SERVER_VERBOSE != 1
- LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
- server_verbose = true;
-#endif
- } else if (arg == "--mlock") {
- params.use_mlock = true;
- } else if (arg == "--no-mmap") {
- params.use_mmap = false;
- } else if (arg == "--numa") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- } else {
- std::string value(argv[i]);
- /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
- else { invalid_param = true; break; }
- }
- } else if (arg == "--embedding" || arg == "--embeddings") {
- params.embedding = true;
- } else if (arg == "-cb" || arg == "--cont-batching") {
- params.cont_batching = true;
- } else if (arg == "-fa" || arg == "--flash-attn") {
- params.flash_attn = true;
- } else if (arg == "-np" || arg == "--parallel") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_parallel = std::stoi(argv[i]);
- } else if (arg == "-n" || arg == "--n-predict") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_predict = std::stoi(argv[i]);
- } else if (arg == "-spf" || arg == "--system-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::string system_prompt;
- std::copy(
- std::istreambuf_iterator<char>(file),
- std::istreambuf_iterator<char>(),
- std::back_inserter(system_prompt)
- );
- sparams.system_prompt = system_prompt;
- } else if (arg == "-ctk" || arg == "--cache-type-k") {
- params.cache_type_k = argv[++i];
- } else if (arg == "-ctv" || arg == "--cache-type-v") {
- params.cache_type_v = argv[++i];
- } else if (arg == "--log-format") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (std::strcmp(argv[i], "json") == 0) {
- server_log_json = true;
- } else if (std::strcmp(argv[i], "text") == 0) {
- server_log_json = false;
- } else {
- invalid_param = true;
- break;
- }
- } else if (arg == "--log-disable") {
- log_set_target(stdout);
- LOG_INFO("logging to file is disabled.", {});
- } else if (arg == "--slots-endpoint-disable") {
- sparams.slots_endpoint = false;
- } else if (arg == "--metrics") {
- sparams.metrics_endpoint = true;
- } else if (arg == "--slot-save-path") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.slot_save_path = argv[i];
- // if doesn't end with DIRECTORY_SEPARATOR, add it
- if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
- sparams.slot_save_path += DIRECTORY_SEPARATOR;
- }
- } else if (arg == "--chat-template") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (!verify_custom_template(argv[i])) {
- fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
- fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
- invalid_param = true;
- break;
- }
- sparams.chat_template = argv[i];
- } else if (arg == "--override-kv") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
- fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- server_print_usage(argv[0], default_params, default_sparams);
- exit(1);
- }
- }
-
- gpt_params_handle_model_default(params);
-
- if (!params.kv_overrides.empty()) {
- params.kv_overrides.emplace_back();
- params.kv_overrides.back().key[0] = 0;
- }
-
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- server_print_usage(argv[0], default_params, default_sparams);
- exit(1);
- }
-}
-
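
With the server's bespoke `server_params_parse` deleted above, the server binary goes through the shared `gpt_params_parse` instead. A minimal sketch of the resulting entry point, assuming only names that appear elsewhere in this patch (`gpt_params`, `gpt_params_parse`, `gpt_params_print_usage`), with error handling reduced to the essentials:

```cpp
#include "common.h"  // gpt_params, gpt_params_parse, gpt_params_print_usage

int main(int argc, char ** argv) {
    gpt_params params;

    // the parser returns false when parsing fails; printing usage
    // is now the caller's responsibility
    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    // ... continue with server setup using the unified params struct
    return 0;
}
```

A consequence of this design is that each example can pre-set its own defaults in `params` before calling the parser, instead of carrying a private argument loop.
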
static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
// skip GH copilot requests when using default port
if (req.path == "/v1/health" || req.path == "/v1/completions") {
log_disable();
#endif
// own arguments required by this example
- gpt_params params;
- server_params sparams;
+ gpt_params params;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
+ return 1;
+ }
+
+ // TODO: not great to use extern vars
+ server_log_json = params.log_json;
+ server_verbose = params.verbose;
// struct that contains llama context and inference
server_context ctx_server;
- server_params_parse(argc, argv, sparams, params);
-
- if (!sparams.system_prompt.empty()) {
- ctx_server.system_prompt_set(sparams.system_prompt);
+ if (!params.system_prompt.empty()) {
+ ctx_server.system_prompt_set(params.system_prompt);
}
if (params.model_alias == "unknown") {
std::unique_ptr<httplib::Server> svr;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") {
- LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}});
+ if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+ LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}});
svr.reset(
- new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str())
+ new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
);
} else {
LOG_INFO("Running without SSL", {});
});
// set timeouts and change hostname and port
- svr->set_read_timeout (sparams.read_timeout);
- svr->set_write_timeout(sparams.write_timeout);
+ svr->set_read_timeout (params.timeout_read);
+ svr->set_write_timeout(params.timeout_write);
- if (!svr->bind_to_port(sparams.hostname, sparams.port)) {
- fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
+ if (!svr->bind_to_port(params.hostname, params.port)) {
+ fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
return 1;
}
std::unordered_map<std::string, std::string> log_data;
- log_data["hostname"] = sparams.hostname;
- log_data["port"] = std::to_string(sparams.port);
+ log_data["hostname"] = params.hostname;
+ log_data["port"] = std::to_string(params.port);
- if (sparams.api_keys.size() == 1) {
- auto key = sparams.api_keys[0];
+ if (params.api_keys.size() == 1) {
+ auto key = params.api_keys[0];
log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
- } else if (sparams.api_keys.size() > 1) {
- log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
+ } else if (params.api_keys.size() > 1) {
+ log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
}
// load the model
const auto model_meta = ctx_server.model_meta();
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (sparams.chat_template.empty()) {
+ if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) {
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "chatml";
+ params.chat_template = "chatml";
}
}
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
- const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
+ const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
LOG_INFO("chat template", {
{"chat_example", chat_example},
- {"built_in", sparams.chat_template.empty()},
+ {"built_in", params.chat_template.empty()},
});
}
// Middlewares
//
- auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
// TODO: should we apply API key to all endpoints, including "/health" and "/models"?
static const std::set<std::string> protected_endpoints = {
"/props",
};
// If API key is not set, skip validation
- if (sparams.api_keys.empty()) {
+ if (params.api_keys.empty()) {
return true;
}
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
- if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+ if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
return true; // API key is valid
}
}
};
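
From the client side, the check above amounts to standard bearer-token auth: requests to protected endpoints must carry an `Authorization: Bearer <key>` header whose `<key>` matches one of the configured API keys; otherwise validation fails.
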
res.status = 200; // HTTP OK
- if (sparams.slots_endpoint && req.has_param("include_slots")) {
+ if (params.endpoint_slots && req.has_param("include_slots")) {
health["slots"] = result.data.at("slots");
}
};
const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
- if (!sparams.slots_endpoint) {
+ if (!params.endpoint_slots) {
res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
return;
}
};
const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
- if (!sparams.metrics_endpoint) {
+ if (!params.endpoint_metrics) {
res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.status = 200; // HTTP OK
};
- const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+ const auto handle_slots_save = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data.at("filename");
if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
}
- std::string filepath = sparams.slot_save_path + filename;
+ std::string filepath = params.slot_save_path + filename;
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_SAVE;
}
};
- const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+ const auto handle_slots_restore = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data.at("filename");
if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
}
- std::string filepath = sparams.slot_save_path + filename;
+ std::string filepath = params.slot_save_path + filename;
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
res.set_content(models.dump(), "application/json; charset=utf-8");
};
- const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
+ json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
const int id_task = ctx_server.queue_tasks.get_new_id();
//
// register static assets routes
- if (!sparams.public_path.empty()) {
+ if (!params.public_path.empty()) {
// Set the base directory for serving static files
- svr->set_base_dir(sparams.public_path);
+ svr->set_base_dir(params.public_path);
}
+
// using embedded static files
- svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
- svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
- json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+ svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+ svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
// add new-ui files
- svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
- svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+ svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+ svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
- svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
- svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+ svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+ svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
// register API routes
svr->Get ("/health", handle_health);
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
- if (!sparams.slot_save_path.empty()) {
+ if (!params.slot_save_path.empty()) {
// only enable slot endpoints if slot_save_path is set
svr->Post("/slots/:id_slot", handle_slots_action);
}
//
// Start the server
//
- if (sparams.n_threads_http < 1) {
+ if (params.n_threads_http < 1) {
// +2 threads for monitoring endpoints
- sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+ params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
}
- log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
- svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+ log_data["n_threads_http"] = std::to_string(params.n_threads_http);
+ svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
LOG_INFO("HTTP server listening", log_data);
// chat template utils
//
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string & tmpl) {
- llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
- return res >= 0;
-}
-
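
The helper removed above encoded a useful idiom: `llama_chat_apply_template` returns a negative value when the built-in matcher does not recognize a template string. A self-contained sketch of that probe, under a hypothetical name (`template_is_supported` is not part of the patch):

```cpp
#include "llama.h"

#include <string>

// probe a custom template string without a model: passing nullptr for the
// model means only the supplied template is consulted, and a negative
// return value signals an unsupported template (buf may be null with
// length 0, since we only care about the return code)
static bool template_is_supported(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}};
    const int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
    return res >= 0;
}
```
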
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
```bash
-./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
...
#include <string>
#include <vector>
-int main(int argc, char ** argv) {
- gpt_params params;
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
- return 1 ;
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+ LOG_TEE("\n");
+}
- if (argc >= 2) {
- params.model = argv[1];
- }
+int main(int argc, char ** argv) {
+ gpt_params params;
- if (argc >= 3) {
- params.prompt = argv[2];
- }
+ params.prompt = "Hello my name is";
+ params.n_predict = 32;
- if (params.prompt.empty()) {
- params.prompt = "Hello my name is";
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
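
Note that `params.prompt` and `params.n_predict` are assigned before parsing, so they act only as this example's defaults; flags handled by `gpt_params_parse` (such as the `-p` and `-n` shown in the usage text above) still override them.
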
// total length of the sequence including the prompt
- const int n_len = 32;
+ const int n_predict = params.n_predict;
// init LLM
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
-
- ctx_params.seed = 1234;
- ctx_params.n_ctx = 2048;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
- const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+ const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
+ LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
- LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
+ LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}
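
(A side note on the arithmetic: `n_kv_req = n_prompt + (n_predict - n_prompt)` collapses to `n_predict`, since in this example the prompt tokens count against the same `n_predict` budget as the generated ones; the KV cache therefore needs room for `n_predict` tokens in total.)
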
const auto t_main_start = ggml_time_us();
- while (n_cur <= n_len) {
+ while (n_cur <= n_predict) {
// sample the next token
{
auto n_vocab = llama_n_vocab(model);
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation?
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG_TEE("\n");
break;
int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
//
LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal (ggml_log_level level, const char* format, ...);
+static void llama_log_internal (ggml_log_level level, const char * format, ...);
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
CLI_ARGS_MAIN_PERPLEXITY = [
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
- "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
+ "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
"model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
- "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
+ "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
"simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
"verbose-prompt"