params.numa = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
+ } else if (arg == "--no-display-prompt") {
+ params.display_prompt = false;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif
+ printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
+ printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
printf(" -gaw N, --grp-attn-w N\n");
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
- printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n");
printf(" -nkvo, --no-kv-offload\n");
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+ fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
}
//
bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
+ bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool is_antiprompt = false;
bool input_echo = true;
+ bool display = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
int n_past = 0;
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
+ display = params.display_prompt;
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
}
// display text
- if (input_echo) {
+ if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
+ display = true;
}
// if not currently processing queued inputs;
// color user input only
console::set_display(console::user_input);
+ display = params.display_prompt;
std::string line;
bool another_line = true;
// done taking input, reset color
console::set_display(console::reset);
+ display = true;
// Add tokens to embd only if the input buffer is non-empty
// Entering an empty line lets the user pass control back