// in the order they have appeared in the batch.
// Rows: number of tokens for which llama_batch.logits[i] != 0
// Cols: n_vocab
+ // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Logits for the ith token. For positive indices, Equivalent to:
// in the order they have appeared in the batch.
// shape: [n_outputs*n_embd]
// Otherwise, returns NULL.
+ // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Get the embeddings for the ith token. For positive indices, Equivalent to:
}
float * llama_context::get_logits() {
+ output_reorder(); // apply the row swaps queued in output_swaps before handing out the logits buffer
+
return logits;
}
float * llama_context::get_logits_ith(int32_t i) {
int64_t j = -1;
+ output_reorder();
+
try {
if (logits == nullptr) {
throw std::runtime_error("no logits");
}
float * llama_context::get_embeddings() {
+ output_reorder(); // apply the row swaps queued in output_swaps before handing out the embd buffer
+
return embd;
}
float * llama_context::get_embeddings_ith(int32_t i) {
int64_t j = -1;
+ output_reorder();
+
try {
if (embd == nullptr) {
throw std::runtime_error("no embeddings");
// TODO: this clear of the buffer can easily be forgotten - need something better
embd_seq.clear();
+ output_swaps.clear();
bool did_optimize = false;
// make the outputs have the same order they had in the user-provided batch
// note: this is mostly relevant for recurrent models atm
if (!sorted_output) {
- const uint32_t n_vocab = model.vocab.n_tokens();
- const uint64_t n_embd = model.hparams.n_embd;
-
GGML_ASSERT((size_t) n_outputs == out_ids.size());
// TODO: is there something more efficient which also minimizes swaps?
continue;
}
std::swap(out_ids[i], out_ids[j_min]);
- if (logits_size > 0) {
- for (uint32_t k = 0; k < n_vocab; k++) {
- std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
- }
- }
- if (embd_size > 0) {
- for (uint32_t k = 0; k < n_embd; k++) {
- std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
- }
- }
+
+ // remember the swaps and apply them lazily upon logits/embeddings access
+ output_swaps.push_back({ i, j_min });
}
std::fill(output_ids.begin(), output_ids.end(), -1);
return n_outputs_max;
}
+// Apply the row swaps queued in output_swaps to the logits and embd buffers,
+// then clear the queue so that a repeated call is a no-op.
+//
+// Each queued entry exchanges rows i0 and i1: n_vocab floats per row in logits
+// and n_embd floats per row in embd. A buffer is skipped when its size is 0.
+void llama_context::output_reorder() {
+    // called on every logits/embeddings access - bail out early when nothing is pending
+    if (output_swaps.empty()) {
+        return;
+    }
+
+    // use 64-bit operands for the row offsets: with 32-bit operands the product
+    // row*n_vocab wraps around once it exceeds UINT32_MAX and the wrong floats get swapped
+    const uint64_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_embd  = model.hparams.n_embd;
+
+    for (const auto & sw : output_swaps) {
+        const uint64_t i0 = sw.i0;
+        const uint64_t i1 = sw.i1;
+
+        if (logits_size > 0) {
+            for (uint64_t k = 0; k < n_vocab; k++) {
+                std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
+            }
+        }
+
+        if (embd_size > 0) {
+            for (uint64_t k = 0; k < n_embd; k++) {
+                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
+            }
+        }
+    }
+
+    output_swaps.clear();
+}
+
//
// graph
//
// Returns max number of outputs for which space was reserved.
uint32_t output_reserve(int32_t n_outputs);
+ void output_reorder(); // applies the swaps queued in output_swaps to the logits/embd buffers, then clears the queue
+
//
// graph
//
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+ struct swap_info { // one pending exchange of two output rows
+ uint32_t i0; // index of the first output row to exchange
+ uint32_t i1; // index of the second output row to exchange
+ };
+
+ std::vector<swap_info> output_swaps; // swaps recorded while sorting the outputs; applied lazily and cleared by output_reorder()
+
ggml_backend_sched_ptr sched;
ggml_backend_t backend_cpu = nullptr;