params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
- } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
- params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-p" || arg == "--prompt") {
params.prompt = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "-n" || arg == "--n_predict") {
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-c" || arg == "--context") {
params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
+ } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
+ params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
+ } else if (arg == "--ignore-eos") {
+ params.ignore_eos = true;
} else if (arg == "-m" || arg == "--model") {
params.model = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "-i" || arg == "--interactive") {
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
- fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
+ fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
+ fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
//
struct gpt_params {
- int32_t seed = -1; // RNG seed
- int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
- int32_t n_predict = 200; // new tokens to predict
- int32_t n_parallel = 1; // number of parallel streams
- int32_t n_batch = 8; // batch size for prompt processing
- int32_t n_ctx = 2048; // context size (this is the KV cache max size)
+ int32_t seed = -1; // RNG seed
+ int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+ int32_t n_predict = 200; // new tokens to predict
+ int32_t n_parallel = 1; // number of parallel streams
+ int32_t n_batch = 8; // batch size for prompt processing
+ int32_t n_ctx = 2048; // context size (this is the KV cache max size)
+ int32_t n_gpu_layers = 0; // number of layers to offload to the GPU
+
+ bool ignore_eos = false; // ignore EOS token when generating text
// sampling parameters
int32_t top_k = 40; // top-k sampling parameter
bool interactive = false; // interactive mode (set via -i / --interactive)
int32_t interactive_port = -1; // port for interactive mode (-1 = unset) — NOTE(review): exact semantics not shown here; confirm at use site
-
- int32_t n_gpu_layers = 0;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);