// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
- // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+ // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
const int n_left = n_past - params.n_keep;
//printf("\n---\n");
}
- if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return 1;
+ // evaluate tokens in batches
+ // embd is typically prepared beforehand to fit within a batch, but not always
+ for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+ int n_eval = (int) embd.size() - i;
+ if (n_eval > params.n_batch) {
+ n_eval = params.n_batch;
+ }
+ if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return 1;
+ }
+ n_past += n_eval;
}
}
- n_past += embd.size();
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {