From: Georgi Gerganov Date: Fri, 13 Jun 2025 10:47:55 +0000 (+0300) Subject: batch : rework llama_batch_allocr (#14153) X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=60c666347becacff81cd4bc9a52038ba71038e41;p=pkg%2Fggml%2Fsources%2Fllama.cpp batch : rework llama_batch_allocr (#14153) * batch : rework llama_batch_allocr ggml-ci * cont : move validation inside class ggml-ci * cont : move output counting to class ggml-ci * cont : minor ggml-ci * batch : add TODOs ggml-ci --- diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 69e0d754..9066d5a9 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -1,5 +1,9 @@ #include "llama-batch.h" +#include "llama-impl.h" +#include "llama-cparams.h" +#include "llama-vocab.h" + #include #include #include @@ -279,9 +283,42 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple ); } -llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) { - batch = in_batch; +llama_batch_allocr::llama_batch_allocr() = default; + +bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) { + clear(); + + batch = batch_inp; + GGML_ASSERT(batch.n_tokens > 0); + + if (!batch.pos) { + if (batch.seq_id) { + LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); + return false; + } + } + + if (batch.token) { + for (int32_t i = 0; i < batch.n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return false; + } + } + } + + if (batch.seq_id) { + for (int32_t i = 0; i < batch.n_tokens; ++i) { + for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) { + if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { + LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_PARALLEL_SEQUENCES); + return false; + } + } + } + } + if (!batch.pos) { assert(p0 >= 0); pos.resize(batch.n_tokens); @@ -290,6 +327,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 } batch.pos = pos.data(); } + if (!batch.n_seq_id) { n_seq_id.resize(batch.n_tokens); for (int32_t i = 0; i < batch.n_tokens; i++) { @@ -297,6 +335,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 } batch.n_seq_id = n_seq_id.data(); } + if (!batch.seq_id) { seq_id.resize(batch.n_tokens + 1); seq_id[batch.n_tokens] = NULL; @@ -305,12 +344,37 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 } batch.seq_id = seq_id.data(); } + if (!batch.logits) { // by default return the output only for the last token output.resize(batch.n_tokens); output[output.size() - 1] = true; batch.logits = output.data(); } + + for (int32_t i = 0; i < batch.n_tokens; ++i) { + n_outputs += batch.logits[i] != 0; + } + + return true; +} + +const llama_batch & llama_batch_allocr::get_batch() const { + return batch; +} + +uint32_t llama_batch_allocr::get_n_outputs() const { + return n_outputs; +} + +void llama_batch_allocr::clear() { + n_outputs = 0; + + batch = {}; + pos.clear(); + n_seq_id.clear(); + seq_id.clear(); + output.clear(); } // diff --git a/src/llama-batch.h b/src/llama-batch.h index 7ad82b52..24340b00 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -18,8 +18,8 @@ struct llama_ubatch { llama_token * token; // [n_tokens] float * embd; // [n_embd, n_tokens] llama_pos * pos; // [n_tokens] - int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence - llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id; + int32_t * n_seq_id; // [n_seqs] + llama_seq_id ** seq_id; // [n_seqs] int8_t * output; // [n_tokens] }; @@ -78,15 +78,28 @@ struct llama_sbatch { }; // temporary allocate memory for the input batch if needed -struct llama_batch_allocr { - struct llama_batch batch; +class llama_batch_allocr { +public: + llama_batch_allocr(); + + // optionally fulfill the batch returned by llama_batch_get_one + bool init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0); + + const llama_batch & get_batch() const; + + uint32_t get_n_outputs() const; + +private: + void clear(); + + llama_batch batch; + + uint32_t n_outputs; std::array seq_id_0 = { 0 }; // default sequence id + std::vector pos; std::vector n_seq_id; std::vector seq_id; std::vector output; - - // optionally fulfill the batch returned by llama_batch_get_one - llama_batch_allocr(struct llama_batch in_batch, llama_pos p0); }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2e551bf6..ec1e1189 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,6 +1,7 @@ #include "llama-context.h" #include "llama-impl.h" +#include "llama-batch.h" #include "llama-io.h" #include "llama-memory.h" #include "llama-mmap.h" @@ -18,7 +19,8 @@ llama_context::llama_context( const llama_model & model, llama_context_params params) : - model(model) { + model(model), + batch_allocr(std::make_unique()) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); t_start_us = model.t_start_us; @@ -494,7 +496,7 @@ float * llama_context::get_logits() { } float * llama_context::get_logits_ith(int32_t i) { - int32_t j = -1; + int64_t j = -1; try { if (logits == nullptr) { @@ -517,7 +519,7 @@ float * llama_context::get_logits_ith(int32_t i) { } if (j >= n_outputs) { // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } return logits + j*model.vocab.n_tokens(); @@ -536,7 +538,7 @@ float * llama_context::get_embeddings() { } float * llama_context::get_embeddings_ith(int32_t i) { - int32_t j = -1; + int64_t j = -1; try { if (embd == nullptr) { @@ -559,7 +561,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { } if (j >= n_outputs) { // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } return embd + j*model.hparams.n_embd; @@ -719,40 +721,27 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, return res; } -int llama_context::encode(llama_batch & inp_batch) { - if (inp_batch.n_tokens == 0) { +int llama_context::encode(const llama_batch & batch_inp) { + if (batch_inp.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } // temporary allocate memory for the input batch if needed // note: during encode, we always pass the full sequence starting from pos = 0 - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0); + if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) { + LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); + return -1; + } - const llama_batch & batch = batch_allocr.batch; - const int32_t n_tokens = batch.n_tokens; + const llama_batch & batch = batch_allocr->get_batch(); - const auto & hparams = model.hparams; + const uint32_t n_tokens = batch.n_tokens; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - // TODO: move the validation to the llama_batch_allocr - if (batch.token) { - for (int32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - - if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { - LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES); - throw -1; - } - } - } - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); @@ -763,6 +752,8 @@ int llama_context::encode(llama_batch & inp_batch) { n_queued_tokens += n_tokens; + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true); @@ -775,7 +766,7 @@ int llama_context::encode(llama_batch & inp_batch) { return -2; }; - for (int32_t i = 0; i < n_tokens; ++i) { + for (uint32_t i = 0; i < n_tokens; ++i) { output_ids[i] = i; } @@ -831,7 +822,8 @@ int llama_context::encode(llama_batch & inp_batch) { GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (int32_t i = 0; i < n_tokens; i++) { + // TODO: fix indexing [UBATCH_IDX] + for (uint32_t i = 0; i < n_tokens; i++) { const llama_seq_id seq_id = ubatch.seq_id[i][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; @@ -846,6 +838,7 @@ int llama_context::encode(llama_batch & inp_batch) { auto & embd_seq_out = embd_seq; const uint32_t n_cls_out = hparams.n_cls_out; + // TODO: fix indexing [UBATCH_IDX] for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { @@ -878,13 +871,11 @@ int llama_context::encode(llama_batch & inp_batch) { memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd)); // remember the sequence ids used during the encoding - needed for cross attention later - // TODO: the seuqence indexing here is likely not correct in the general case - // probably works only for split_simple cross.seq_ids_enc.resize(n_tokens); - for (int32_t i = 0; i < n_tokens; i++) { + for (uint32_t i = 0; i < n_tokens; i++) { cross.seq_ids_enc[i].clear(); - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; + for (int s = 0; s < batch.n_seq_id[i]; s++) { + llama_seq_id seq_id = batch.seq_id[i][s]; cross.seq_ids_enc[i].insert(seq_id); } } @@ -893,68 +884,44 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -int llama_context::decode(llama_batch & inp_batch) { +int llama_context::decode(const llama_batch & batch_inp) { if (!memory) { LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); - return encode(inp_batch); + return encode(batch_inp); } - if (inp_batch.n_tokens == 0) { + if (batch_inp.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } - if (!inp_batch.pos) { - if (inp_batch.seq_id) { - LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); - return -1; - } - } - // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1); + if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) { + LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); + return -1; + } - const llama_batch & batch = batch_allocr.batch; + const llama_batch & batch = batch_allocr->get_batch(); const auto & vocab = model.vocab; const auto & hparams = model.hparams; const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; + const uint32_t n_tokens_all = batch.n_tokens; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - // TODO: move the validation to the llama_batch_allocr - if (batch.token) { - for (int64_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - return -1; - } - - if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { - LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES); - return -1; - } - } - } - // this indicates we are doing pooled embedding const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - int64_t n_outputs_all = 0; - - // count outputs - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; - } + const uint32_t n_outputs_all = batch_allocr->get_n_outputs(); if (embd_pooled) { // require that all tokens are output if (n_outputs_all != n_tokens_all) { - LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %" PRId64 ", n_tokens_all = %" PRId64 ")\n", + LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n", __func__, n_outputs_all, n_tokens_all); return -1; } @@ -1024,7 +991,7 @@ int llama_context::decode(llama_batch & inp_batch) { // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all); return -2; }; @@ -1063,6 +1030,7 @@ int llama_context::decode(llama_batch & inp_batch) { pos_min[s] = std::numeric_limits::max(); } + // TODO: fix sequence indexing for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { const auto & seq_id = ubatch.seq_id[i][0]; @@ -1176,14 +1144,14 @@ int llama_context::decode(llama_batch & inp_batch) { n_outputs = n_outputs_all; // set output mappings - { + if (n_outputs > 0) { bool sorted_output = true; auto & out_ids = mstate->out_ids(); - GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all); + GGML_ASSERT(out_ids.size() == (size_t) n_outputs); - for (int64_t i = 0; i < n_outputs_all; ++i) { + for (int64_t i = 0; i < n_outputs; ++i) { int64_t out_id = out_ids[i]; output_ids[out_id] = i; if (out_id != i) { @@ -1195,20 +1163,22 @@ int llama_context::decode(llama_batch & inp_batch) { // note: this is mostly relevant for recurrent models atm if (!sorted_output) { const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; + const uint64_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); // TODO: is there something more efficient which also minimizes swaps? // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { + for (uint32_t i = 0; i < n_outputs - 1; ++i) { + uint32_t j_min = i; + for (uint32_t j = i + 1; j < n_outputs; ++j) { if (out_ids[j] < out_ids[j_min]) { j_min = j; } } - if (j_min == i) { continue; } + if (j_min == i) { + continue; + } std::swap(out_ids[i], out_ids[j_min]); if (logits_size > 0) { for (uint32_t k = 0; k < n_vocab; k++) { @@ -1221,8 +1191,10 @@ int llama_context::decode(llama_batch & inp_batch) { } } } + std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { + + for (uint32_t i = 0; i < n_outputs; ++i) { output_ids[out_ids[i]] = i; } } @@ -1242,7 +1214,7 @@ int llama_context::decode(llama_batch & inp_batch) { // output // -int32_t llama_context::output_reserve(int32_t n_outputs) { +uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1308,8 +1280,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - this->n_outputs = 0; - this->n_outputs_max = n_outputs_max; + this->n_outputs = 0; return n_outputs_max; } @@ -1800,14 +1771,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { std::vector w_output_pos; - GGML_ASSERT(n_outputs <= n_outputs_max); - w_output_pos.resize(n_outputs); // build a more compact representation of the output ids for (size_t i = 0; i < n_batch(); ++i) { // map an output id to a position in the batch - int32_t pos = output_ids[i]; + int64_t pos = output_ids[i]; if (pos >= 0) { GGML_ASSERT(pos < n_outputs); w_output_pos[pos] = i; @@ -2082,7 +2051,7 @@ void llama_context::opt_epoch_iter( embd_seq.clear(); - int64_t n_outputs_all = n_tokens_all; + uint32_t n_outputs_all = n_tokens_all; auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled); if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { @@ -2092,7 +2061,7 @@ void llama_context::opt_epoch_iter( // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all); GGML_ABORT("TODO: handle this error"); }; diff --git a/src/llama-context.h b/src/llama-context.h index 2e0da8c8..040f03ae 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -1,7 +1,6 @@ #pragma once #include "llama.h" -#include "llama-batch.h" #include "llama-cparams.h" #include "llama-graph.h" #include "llama-adapter.h" @@ -13,6 +12,7 @@ #include struct llama_model; +class llama_batch_allocr; class llama_io_read_i; class llama_io_write_i; @@ -102,8 +102,8 @@ struct llama_context { llama_memory_state_i * mstate, ggml_status & ret); - int encode(llama_batch & inp_batch); - int decode(llama_batch & inp_batch); + int encode(const llama_batch & batch_inp); + int decode(const llama_batch & batch_inp); // // state save/load @@ -181,7 +181,7 @@ private: // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. - int32_t output_reserve(int32_t n_outputs); + uint32_t output_reserve(int32_t n_outputs); // // graph @@ -246,8 +246,10 @@ private: // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch - int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers + // reuse the batch_allocr to avoid unnecessary memory allocations + std::unique_ptr batch_allocr; + + uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch std::vector output_ids; // map batch token positions to ids of the logits and embd buffers diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 4493fb16..337fb5cb 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -139,6 +139,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { std::vector sum(n_tokens, 0); + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -156,6 +157,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { } } + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -180,6 +182,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { uint32_t * data = (uint32_t *) cls->data; memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -210,6 +213,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -283,6 +287,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { const int32_t ti = s0*n_seq_tokens + i; float f = -INFINITY; + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) { if (hparams.use_alibi) { @@ -322,6 +327,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { const int32_t ti = s0*n_seq_tokens + i; float f = -INFINITY; + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { if (ubatch->seq_id[s0][s] == seq_id) { if (hparams.use_alibi) { @@ -377,6 +383,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_enc; ++i) { float f = -INFINITY; + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < ubatch->n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch->seq_id[j][s]; if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { diff --git a/src/llama-graph.h b/src/llama-graph.h index 88fb77f1..87813119 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -378,7 +378,7 @@ struct llm_graph_params { const llama_memory_state_i * mstate; const llama_cross * cross; - int32_t n_outputs; + uint32_t n_outputs; const llm_graph_cb & cb; }; @@ -412,8 +412,8 @@ struct llm_graph_context { const float norm_eps; const float norm_rms_eps; - const int32_t n_tokens; - const int32_t n_outputs; + const int64_t n_tokens; + const int64_t n_outputs; const int32_t n_ctx_orig; // yarn const enum llama_pooling_type pooling_type; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 89606c59..d4e92eab 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -674,6 +674,7 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch cells.pos_set(head_cur + idx, ubatch.pos[idx]); + // TODO: fix indexing [UBATCH_IDX] for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) { cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]); }