}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
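+ // a custom prefix/suffix implies a hand-built prompt layout, which an auto-applied chat template would conflict with (same for --in-prefix/--in-suffix below)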
+ params.enable_chat_template = false;
return true;
}
if (arg == "--in-prefix") {
CHECK_ARG
params.input_prefix = argv[i];
+ params.enable_chat_template = false;
return true;
}
if (arg == "--in-suffix") {
CHECK_ARG
params.input_suffix = argv[i];
+ params.enable_chat_template = false;
return true;
}
if (arg == "--spm-infill") {
"halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
- options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass) {
+ std::ostringstream ss;
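+ // format the past turns on their own first; the full formatting below starts with this exact string, so the new part can be sliced off its end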
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+ ss << "\n";
+ }
+ // format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
- auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
- return formatted;
+ // get the diff part
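+ // (e.g. with a ChatML template, past = [user "Hi", assistant "Hello!"] plus a new
+ //  user "Bye" yields roughly "\n<|im_start|>user\nBye<|im_end|>\n<|im_start|>assistant\n")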
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+ return ss.str();
}
std::string llama_chat_format_example(const struct llama_model * model,
std::vector<llama_token> embd_inp;
{
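+ // in conversation mode the system prompt is run through the chat template, unless templating was disabled via --in-prefix/--in-suffix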
- auto prompt = params.conversation
+ auto prompt = (params.conversation && params.enable_chat_template)
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
is_antiprompt = true;
}
- chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+ if (params.enable_chat_template) {
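+ // record the finished assistant reply so the next turn is formatted against the full history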
+ chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+ }
is_interacting = true;
printf("\n");
}
string_process_escapes(buffer);
}
- std::string user_inp = params.conversation
+ bool format_chat = params.conversation && params.enable_chat_template;
+ std::string user_inp = format_chat
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
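+ // parse_special mirrors format_chat: templated input contains special tokens (e.g. <|im_start|>) that must be parsed, while raw user input must not be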
+ const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());