params.interactive_first = true;
} else if (arg == "-ins" || arg == "--instruct") {
    params.instruct = true;
} else if (arg == "-cml" || arg == "--chatml") {
    // ChatML mode: conversation is framed with <|im_start|>/<|im_end|> tags
    params.chatml = true;
} else if (arg == "--infill") {
    params.infill = true;
} else if (arg == "--multiline-input") {
printf("  -i, --interactive     run in interactive mode\n");
printf("  --interactive-first   run in interactive mode and wait for input right away\n");
printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
printf("  -r PROMPT, --reverse-prompt PROMPT\n");
printf("                        halt generation at PROMPT, return control in interactive mode\n");
std::vector<llama_token> embd_inp;
- if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+ if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
+ if (params.chatml) {
+ params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
+ }
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
} else {
LOG("use session tokens\n");
}
// number of tokens to keep when resetting context
- if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
+ if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
params.n_keep = (int)embd_inp.size();
}
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
+ // chatml prefix & suffix
+ const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+ const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+
+ LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
+ LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
+
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
    params.interactive_first = true;
    params.antiprompt.push_back("### Instruction:\n\n");
}
// similar for chatml mode: stop generating once the model starts a new user turn
else if (params.chatml) {
    params.interactive_first = true;
    params.antiprompt.push_back("<|im_start|>user\n");
}

// enable interactive mode if interactive start is specified
if (params.interactive_first) {
    is_interacting = true;
    printf("\n");
} else if (params.instruct || params.chatml) {
    // both instruct and chatml are inherently interactive
    is_interacting = true;
}
}
if (n_past > 0 && is_interacting) {
    LOG("waiting for user input\n");

    // instruct/chatml mode: show an input prompt to the user
    if (params.instruct || params.chatml) {
        printf("\n> ");
    }
    n_consumed = embd_inp.size();
    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
// chatml mode: insert user chat prefix (skipped when generation already
// stopped on the antiprompt, which is itself the user-turn opener)
if (params.chatml && !is_antiprompt) {
    LOG("inserting chatml prefix\n");
    n_consumed = embd_inp.size();
    embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
}
if (params.escape) {
    process_escapes(buffer);
}
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
+ // chatml mode: insert assistant chat suffix
+ if (params.chatml) {
+ LOG("inserting chatml suffix\n");
+ embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
+ }
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
}
// end of text token
- if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
+ if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
LOG_TEE(" [end of text]\n");
break;
}