float * llama_context::get_logits() {
output_reorder();
- return logits;
+ return logits.data;
}
int64_t llama_context::output_resolve_row(int32_t i) const {
output_reorder();
try {
- if (logits == nullptr) {
+ if (logits.data == nullptr) {
throw std::runtime_error("no logits");
}
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
- return logits + j*model.vocab.n_tokens();
+ return logits.data + j*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
float * llama_context::get_embeddings() {
output_reorder();
- return embd;
+ return embd.data;
}
llama_token * llama_context::get_sampled_tokens() const{
- return sampling.sampled;
+ return sampling.sampled.data;
}
float * llama_context::get_embeddings_ith(int32_t i) {
output_reorder();
try {
- if (embd == nullptr) {
+ if (embd.data == nullptr) {
throw std::runtime_error("no embeddings");
}
}
const uint32_t n_embd_out = model.hparams.n_embd_out();
- return embd + j*n_embd_out;
+ return embd.data + j*n_embd_out;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
output_reorder();
- if (sampling.sampled == nullptr) {
+ if (!sampling.sampled.has_data()) {
return LLAMA_TOKEN_NULL;
}
try {
const int64_t row = output_resolve_row(idx);
- GGML_ASSERT(row < (int64_t) sampling.sampled_size);
- return sampling.sampled[row];
+ GGML_ASSERT(row < (int64_t) sampling.sampled.size);
+ return sampling.sampled.data[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
return LLAMA_TOKEN_NULL;
float * llama_context::get_sampled_probs_ith(int32_t idx) {
output_reorder();
- if (sampling.probs == nullptr) {
+ if (!sampling.probs.has_data()) {
return nullptr;
}
if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
return nullptr;
}
- return sampling.probs + row*model.vocab.n_tokens();
+ return sampling.probs.data + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
float * llama_context::get_sampled_logits_ith(int32_t idx) {
output_reorder();
- if (sampling.logits == nullptr) {
+ if (!sampling.logits.has_data()) {
return nullptr;
}
if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
return nullptr;
}
- return sampling.logits + row*model.vocab.n_tokens();
+ return sampling.logits.data + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
try {
const int64_t row = output_resolve_row(idx);
- if (sampling.candidates != nullptr &&
+ if (sampling.candidates.has_data() &&
(size_t) row < sampling.candidates_count.size() &&
sampling.candidates_count[row] > 0) {
- return sampling.candidates + row*model.vocab.n_tokens();
+ return sampling.candidates.data + row*model.vocab.n_tokens();
}
} catch (const std::exception & err) {
// fallback to full vocab list
size_t llama_context::get_sampled_candidates_count(int32_t idx) {
output_reorder();
- if (sampling.candidates == nullptr) {
+ if (!sampling.candidates.has_data()) {
return 0;
}
size_t llama_context::get_sampled_logits_count(int32_t idx) {
output_reorder();
- if (sampling.logits == nullptr) {
+ if (!sampling.logits.has_data()) {
return model.vocab.n_tokens();
}
size_t llama_context::get_sampled_probs_count(int32_t idx) {
output_reorder();
- if (sampling.probs == nullptr) {
+ if (!sampling.probs.has_data()) {
return 0;
}
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
// extract logits
- if (logits && t_logits) {
+ if (logits.data && t_logits) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
- GGML_ASSERT(logits != nullptr);
+ GGML_ASSERT(logits.data != nullptr);
- ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
+ ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_vocab*sizeof(float));
}
// extract embeddings
- if (embd && t_embd) {
+ if (embd.data && t_embd) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
- GGML_ASSERT(embd != nullptr);
+ GGML_ASSERT(embd.data != nullptr);
const uint32_t n_embd_out = hparams.n_embd_out();
- GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
- ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
+ GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd.size);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_embd_out*sizeof(float));
} break;
case LLAMA_POOLING_TYPE_MEAN:
case LLAMA_POOLING_TYPE_CLS:
cross.n_embd = t_embd->ne[0];
cross.n_enc = t_embd->ne[1];
cross.v_embd.resize(cross.n_embd*cross.n_enc);
- memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd));
+ memcpy(cross.v_embd.data(), embd.data, ggml_nbytes(t_embd));
const auto & batch = balloc->get_batch();
static void copy_tensor_async_ints(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
- llama_token * sampled,
- size_t sampled_size,
+ const buffer_view<llama_token> & sampled,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
- if (sampled == nullptr) {
+ if (!sampled.has_data()) {
return;
}
}
const uint32_t row = it->second;
- GGML_ASSERT(row < sampled_size);
+ GGML_ASSERT(row < sampled.size);
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
- ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
+ ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
}
}
static void copy_tensor_async_floats(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
- float * dst,
+ const buffer_view<float> & dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
- if (dst == nullptr) {
+ if (!dst.has_data()) {
return;
}
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
- float * row_ptr = dst + (size_t) row * stride;
+ float * row_ptr = dst.data + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of logits/probabilities that were written for this row.
static void copy_tensor_async_candidates(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
- llama_token * dst,
+ const buffer_view<llama_token> & dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
- if (dst == nullptr) {
+ if (!dst.has_data()) {
return;
}
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
- llama_token * row_ptr = dst + (size_t) row * stride;
+ llama_token * row_ptr = dst.data + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of candidates that were written.
}
// extract logits
- if (logits && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
+ if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
- GGML_ASSERT(logits != nullptr);
+ GGML_ASSERT(logits.data != nullptr);
- float * logits_out = logits + n_outputs_prev*n_vocab;
+ float * logits_out = logits.data + n_outputs_prev*n_vocab;
if (n_outputs) {
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
- GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size);
+ GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size);
ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
}
}
// extract embeddings
- if (embd && t_embd && n_outputs > 0) {
+ if (embd.data && t_embd && n_outputs > 0) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
- GGML_ASSERT(embd != nullptr);
+ GGML_ASSERT(embd.data != nullptr);
const uint32_t n_embd_out = hparams.n_embd_out();
- float * embd_out = embd + n_outputs_prev*n_embd_out;
+ float * embd_out = embd.data + n_outputs_prev*n_embd_out;
if (n_outputs) {
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
- GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
+ GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd.size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
}
} break;
const auto stride = n_vocab;
// async copy the sampling data from the backend to the host
- copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
+ copy_tensor_async_ints(res->t_sampled, sampling.sampled, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
size_t backend_float_count = 0;
size_t backend_token_count = 0;
- logits_size = has_logits ? n_vocab*n_outputs_max : 0;
- embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
+ logits.size = has_logits ? n_vocab*n_outputs_max : 0;
+ embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
// Allocate backend sampling output buffers if there are backend samplers configured.
const bool has_sampling = !sampling.samplers.empty();
if (has_sampling) {
- sampling.logits_size = n_vocab*n_outputs_max;
- sampling.probs_size = n_vocab*n_outputs_max;
- sampling.sampled_size = n_outputs_max;
- sampling.candidates_size = n_vocab*n_outputs_max;
-
- backend_float_count = sampling.logits_size + sampling.probs_size;
- backend_token_count = sampling.sampled_size + sampling.candidates_size;
+ backend_float_count = 2 * n_vocab * n_outputs_max; // logits + probs
+ backend_token_count = (1 + n_vocab) * n_outputs_max; // sampled + candidates
}
if (output_ids.empty()) {
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
const size_t new_size =
- (logits_size + embd_size + backend_float_count) * sizeof(float) +
+ (logits.size + embd.size + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
// alloc only when more than the current capacity is required
// TODO: not needed?
buf_output = nullptr;
- logits = nullptr;
- embd = nullptr;
+ logits.data = nullptr;
+ embd.data = nullptr;
}
auto * buft = ggml_backend_cpu_buffer_type();
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
- logits = nullptr;
- embd = nullptr;
-
size_t offset = 0;
uint8_t * base = (uint8_t *) output_base;
- logits = has_logits ? output_base : nullptr;
- offset += logits_size * sizeof(float);
+ logits = has_logits ? buffer_view<float>{output_base, logits.size} : buffer_view<float>{nullptr, 0};
+ offset += logits.size * sizeof(float);
- embd = has_embd ? (float *) (base + offset) : nullptr;
- offset += embd_size * sizeof(float);
+ embd = has_embd ? buffer_view<float>{(float *) (base + offset), embd.size} : buffer_view<float>{nullptr, 0};
+ offset += embd.size * sizeof(float);
- sampling.logits = nullptr;
- sampling.probs = nullptr;
- sampling.sampled = nullptr;
- sampling.candidates = nullptr;
+ sampling.logits = {nullptr, 0};
+ sampling.probs = {nullptr, 0};
+ sampling.sampled = {nullptr, 0};
+ sampling.candidates = {nullptr, 0};
if (has_sampling) {
- sampling.logits = (float *) (base + offset);
- offset += sampling.logits_size * sizeof(float);
+ sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
+ offset += sampling.logits.size * sizeof(float);
- sampling.probs = (float *) (base + offset);
- offset += sampling.probs_size * sizeof(float);
+ sampling.probs = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
+ offset += sampling.probs.size * sizeof(float);
- sampling.sampled = (llama_token *) (base + offset);
- offset += sampling.sampled_size * sizeof(llama_token);
+ sampling.sampled = {(llama_token *) (base + offset), (size_t)n_outputs_max};
+ offset += sampling.sampled.size * sizeof(llama_token);
- sampling.candidates = (llama_token *) (base + offset);
- offset += sampling.candidates_size * sizeof(llama_token);
+ sampling.candidates = {(llama_token *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
+ offset += sampling.candidates.size * sizeof(llama_token);
// The count vectors keep track of the actual number of logits/probs/candidates
// copied from the backend for each output row.
std::fill(sampling.probs_count.begin(), sampling.probs_count.end(), 0);
std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
- std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
+ std::fill_n(sampling.sampled.data, sampling.sampled.size, LLAMA_TOKEN_NULL);
}
// set all ids as invalid (negative)
const uint64_t i0 = output_swaps[s].i0;
const uint64_t i1 = output_swaps[s].i1;
- if (logits_size > 0) {
+ if (logits.size > 0) {
for (uint64_t k = 0; k < n_vocab; k++) {
- std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
+ std::swap(logits.data[i0*n_vocab + k], logits.data[i1*n_vocab + k]);
}
}
- if (embd_size > 0) {
+ if (embd.size > 0) {
for (uint64_t k = 0; k < n_embd; k++) {
- std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
+ std::swap(embd.data[i0*n_embd + k], embd.data[i1*n_embd + k]);
}
}
- if (sampling.logits && sampling.logits_size > 0) {
+ if (sampling.logits.has_data()) {
for (uint64_t k = 0; k < n_vocab; ++k) {
- std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
+ std::swap(sampling.logits.data[i0*n_vocab + k], sampling.logits.data[i1*n_vocab + k]);
}
}
- if (sampling.probs && sampling.probs_size > 0) {
+ if (sampling.probs.has_data()) {
for (uint64_t k = 0; k < n_vocab; ++k) {
- std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
+ std::swap(sampling.probs.data[i0*n_vocab + k], sampling.probs.data[i1*n_vocab + k]);
}
}
- if (sampling.candidates && sampling.candidates_size > 0) {
+ if (sampling.candidates.has_data()) {
for (uint64_t k = 0; k < n_vocab; ++k) {
- std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
+ std::swap(sampling.candidates.data[i0*n_vocab + k], sampling.candidates.data[i1*n_vocab + k]);
}
}
- if (sampling.sampled && sampling.sampled_size > 0) {
- std::swap(sampling.sampled[i0], sampling.sampled[i1]);
+ if (sampling.sampled.has_data()) {
+ std::swap(sampling.sampled.data[i0], sampling.sampled.data[i1]);
}
if (!sampling.logits_count.empty()) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens());
io.write(&logits_size, sizeof(logits_size));
if (logits_size) {
- io.write(logits, logits_size * sizeof(float));
+ io.write(logits.data, logits_size * sizeof(float));
}
}
{
LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
- const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd);
+ const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd);
io.write(&embd_size, sizeof(embd_size));
if (embd_size) {
- io.write(embd, embd_size * sizeof(float));
+ io.write(embd.data, embd_size * sizeof(float));
}
}
uint64_t logits_size;
io.read_to(&logits_size, sizeof(logits_size));
- if (this->logits_size < logits_size) {
+ if (this->logits.size < logits_size) {
throw std::runtime_error("logits buffer too small");
}
if (logits_size) {
- io.read_to(this->logits, logits_size * sizeof(float));
+ io.read_to(this->logits.data, logits_size * sizeof(float));
}
}
uint64_t embd_size;
io.read_to(&embd_size, sizeof(embd_size));
- if (this->embd_size < embd_size) {
+ if (this->embd.size < embd_size) {
throw std::runtime_error("embeddings buffer too small");
}
if (embd_size) {
- io.read_to(this->embd, embd_size * sizeof(float));
+ io.read_to(this->embd.data, embd_size * sizeof(float));
}
}