//
static int llama_decode_internal(
llama_context & lctx,
- llama_batch batch_all) { // TODO: rename back to batch
+ llama_batch batch) {
lctx.is_encoding = false;
- const uint32_t n_tokens_all = batch_all.n_tokens;
+ const uint32_t n_tokens_all = batch.n_tokens;
if (n_tokens_all == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
- GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
- if (batch_all.token) {
+ if (batch.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
}
lctx.embd_seq.clear();
// count outputs
- if (batch_all.logits && !embd_pooled) {
+ if (batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
- n_outputs += batch_all.logits[i] != 0;
+ n_outputs += batch.logits[i] != 0;
}
} else if (lctx.logits_all || embd_pooled) {
n_outputs = n_tokens_all;
n_outputs = 1;
}
- lctx.sbatch.from_batch(batch_all, n_embd,
+ lctx.sbatch.from_batch(batch, n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);