From: Daniel Bevenius Date: Fri, 6 Feb 2026 06:26:54 +0000 (+0100) Subject: llama : rename llama-sampling to llama-sampler (#19363) X-Git-Tag: upstream/0.0.8067~114 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e696cfc0168ba9616a8bd7d09d6284a1b0fec82b;p=pkg%2Fggml%2Fsources%2Fllama.cpp llama : rename llama-sampling to llama-sampler (#19363) This commit addresses the TODO in llama-sampling.h to rename that header and the implementation to llama-sampler. --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f337afd6b..bedfa1bc3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,7 +31,7 @@ add_library(llama llama-model-saver.cpp llama-model.cpp llama-quant.cpp - llama-sampling.cpp + llama-sampler.cpp llama-vocab.cpp unicode-data.cpp unicode.cpp diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index 64ea2fd00..2d55070ce 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -2,7 +2,7 @@ #include "llama-impl.h" #include "llama-vocab.h" -#include "llama-sampling.h" +#include "llama-sampler.h" #include #include diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp new file mode 100644 index 000000000..9bbc5dbde --- /dev/null +++ b/src/llama-sampler.cpp @@ -0,0 +1,3885 @@ +#include "llama-sampler.h" + +#include "llama-impl.h" +#include "llama-vocab.h" +#include "llama-grammar.h" + +#include "ggml-cpp.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// the ring buffer works similarly to std::deque, but with a fixed capacity +template +struct ring_buffer { + ring_buffer(size_t cap) : capacity(cap), data(cap) {} + + T & front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + const T & front() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + T & back() { + if (sz == 0) { + throw std::runtime_error("ring buffer is 
empty"); + } + // pos is the next write slot, so the newest element lives one slot before it + return data[(pos + capacity - 1) % capacity]; + } + + const T & back() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + // pos is the next write slot, so the newest element lives one slot before it + return data[(pos + capacity - 1) % capacity]; + } + + // appends value; when the buffer is already full the oldest element is overwritten + void push_back(const T & value) { + if (capacity == 0) { + throw std::runtime_error("ring buffer: capacity is zero"); + } + + if (sz == capacity) { + // advance the start when buffer is full + first = (first + 1) % capacity; + } else { + sz++; + } + data[pos] = value; + pos = (pos + 1) % capacity; + } + + // removes and returns the oldest element + T pop_front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + T value = data[first]; + first = (first + 1) % capacity; + sz--; + return value; + } + + //T & operator[](size_t i) { + // if (i >= sz) { + // throw std::runtime_error("ring buffer: index out of bounds"); + // } + // return data[(first + i) % capacity]; + //} + + //const T & at(size_t i) const { + // if (i >= sz) { + // throw std::runtime_error("ring buffer: index out of bounds"); + // } + // return data[(first + i) % capacity]; + //} + + // reverse access: rat(0) is the newest element, rat(sz - 1) the oldest + const T & rat(size_t i) const { + if (i >= sz) { + throw std::runtime_error("ring buffer: index out of bounds"); + } + return data[(first + sz - i - 1) % capacity]; + } + + // copies the stored elements, ordered from oldest to newest + std::vector to_vector() const { + std::vector result; + result.reserve(sz); + for (size_t i = 0; i < sz; i++) { + result.push_back(data[(first + i) % capacity]); + } + return result; + } + + void clear() { + // here only reset the status of the buffer + sz = 0; + first = 0; + pos = 0; + } + + bool empty() const { + return sz == 0; + } + + size_t size() const { + return sz; + } + + size_t capacity = 0; + size_t sz = 0; + size_t first = 0; + size_t pos = 0; + + std::vector data; +}; + +// writes result in res, does not mutate cur +static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector & res) { + static const auto comp = [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }; + + constexpr int nbuckets = 128; + 
constexpr float bucket_low = -10.0f; + constexpr float bucket_high = 10.0f; + constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); + constexpr float bucket_inter = -bucket_low * bucket_scale; + + std::vector bucket_idx; + std::vector histo(nbuckets, 0); + + std::vector bucket_ptrs; + + bucket_idx.reserve(cur.size); + + for (int i = 0; i < (int)cur.size; ++i) { + const float val = cur.data[i].logit; + int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + ib = std::max(0, std::min(nbuckets - 1, ib)); + bucket_idx.push_back(ib); + ++histo[ib]; + } + int nhave = 0; + int ib = nbuckets - 1; + for ( ; ib >= 0; --ib) { + nhave += histo[ib]; + if (nhave >= npartial) { + break; + } + } + res.resize(nhave); + auto * ptr = res.data(); + bucket_ptrs.reserve(nbuckets - ib); + for (int j = nbuckets - 1; j >= ib; --j) { + bucket_ptrs.push_back(ptr); + ptr += histo[j]; + } + for (int i = 0; i < (int)cur.size; ++i) { + int j = bucket_idx[i]; + if (j >= ib) { + *bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i]; + } + } + + ptr = res.data(); + int ndone = 0; + for (int j = nbuckets - 1; j > ib; --j) { + std::sort(ptr, ptr + histo[j], comp); + ptr += histo[j]; + ndone += histo[j]; + } + std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp); +} + +// reduces the size of cur_p to npartial, keeping only the top npartial elements +static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) { + static const auto comp = [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }; + + if (npartial <= 128) { + std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp); + + cur_p->size = npartial; + cur_p->sorted = true; + + return; + } + + std::vector tmp; + + llama_token_data_array_partial_sort(*cur_p, npartial, tmp); + + std::copy(tmp.data(), tmp.data() + npartial, cur_p->data); + + cur_p->size = 
npartial; + cur_p->sorted = true; +} + +static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { + // iterator for the probabilities +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif + + struct probs_iterator { + typedef std::input_iterator_tag iterator_category; + typedef float value_type; + typedef float * pointer; + typedef float & reference; + typedef ptrdiff_t difference_type; + + const llama_token_data * data; + + bool operator==(const probs_iterator & other) const { return data == other.data; } + bool operator!=(const probs_iterator & other) const { return data != other.data; } + const float & operator*() const { return data->p; } + probs_iterator & operator++() { ++data; return *this; } + probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; } + }; + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif + + std::discrete_distribution dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size}); + + return dist(rng); +} + +/* +static void llama_log_softmax(float * array, size_t size) { + float max_l = *std::max_element(array, array + size); + float sum = 0.f; + for (size_t i = 0; i < size; ++i) { + float p = expf(array[i] - max_l); + sum += p; + array[i] = p; + } + + for (size_t i = 0; i < size; ++i) { + array[i] = logf(array[i] / sum); + } +} +*/ + +static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) { + if (temp <= 0.0f) { + // find the token with the highest logit and set the rest to -inf + size_t max_i = 0; + float max_l = cur_p->data[0].logit; + + for (size_t i = 1; i < cur_p->size; ++i) { + if (cur_p->data[i ].logit > max_l) { + cur_p->data[max_i].logit = -INFINITY; + max_i = i; + max_l = cur_p->data[i].logit; + } else { + cur_p->data[i].logit = -INFINITY; + } + } + + return; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= temp; + } +} + +static void 
llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) { + GGML_ASSERT(cur_p->size > 0); + + // Sort the logits in descending order if requested + if (do_sort && !cur_p->sorted) { + llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); + } + + float max_l = cur_p->data[0].logit; + if (!cur_p->sorted) { + for (size_t i = 1; i < cur_p->size; ++i) { + max_l = std::max(max_l, cur_p->data[i].logit); + } + } + + float cum_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; + cum_sum += p; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum; + } +} + +static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) { + // if (k >= (int32_t)cur_p->size) { + // return; + // } + + if (k <= 0) { + return; + } + + k = std::min(k, (int) cur_p->size); + + // Sort scores in descending order + if (!cur_p->sorted) { + llama_token_data_array_partial_sort_inplace(cur_p, k); + } + + cur_p->size = k; +} + +static uint32_t get_rng_seed(uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + // use system clock if std::random_device is not a true RNG + static bool is_rd_prng = std::random_device().entropy() == 0; + if (is_rd_prng) { + return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count(); + } + std::random_device rd; + return rd(); + } + return seed; +} + +// llama_sampler API + +struct llama_sampler * llama_sampler_init( + struct llama_sampler_i * iface, + llama_sampler_context_t ctx) { + return new llama_sampler { + /* .iface = */ iface, + /* .ctx = */ ctx, + }; +} + +// returns the name reported by the sampler's interface, or "(null)" when there is no sampler or no interface +const char * llama_sampler_name(const struct llama_sampler * smpl) { + // the sibling entry points (accept/apply/reset/clone/free) tolerate a null sampler, so do the same here + if (!smpl || !smpl->iface) { + return "(null)"; + } + + return smpl->iface->name(smpl); +} + +void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) { + if (!smpl) { + return; + } + + if (smpl->iface->accept) { + smpl->iface->accept(smpl, token); + } +} + +void 
llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) { + if (!smpl) { + return; + } + + GGML_ASSERT(smpl->iface->apply); + smpl->iface->apply(smpl, cur_p); +} + +void llama_sampler_reset(struct llama_sampler * smpl) { + if (!smpl) { + return; + } + + if (smpl->iface->reset) { + smpl->iface->reset(smpl); + } +} + +struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { + if (!smpl) { + return nullptr; + } + + if (smpl->iface->clone) { + return smpl->iface->clone(smpl); + } + + if (smpl->ctx == nullptr) { + return llama_sampler_init( + /* .iface = */ smpl->iface, + /* .ctx = */ nullptr + ); + } + + GGML_ABORT("the sampler does not support cloning"); +} + +void llama_sampler_free(struct llama_sampler * smpl) { + if (smpl == nullptr) { + return; + } + + if (smpl->iface->free) { + smpl->iface->free(smpl); + } + + delete smpl; +} + +// empty sampler + +struct llama_sampler_empty { + const char * name; +}; + +static struct llama_sampler * llama_sampler_init_empty(const char * name); + +static const char * llama_sampler_empty_name(const struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_empty *) smpl->ctx; + return ctx->name; +} + +static void llama_sampler_empty_accept(struct llama_sampler * smpl, llama_token token) { + GGML_UNUSED(smpl); + GGML_UNUSED(token); +} + +static void llama_sampler_empty_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + GGML_UNUSED(smpl); + GGML_UNUSED(cur_p); +} + +static void llama_sampler_empty_reset(struct llama_sampler * smpl) { + GGML_UNUSED(smpl); +} + +static struct llama_sampler * llama_sampler_empty_clone(const struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_empty *) smpl->ctx; + return llama_sampler_init_empty(ctx->name); +} + +static void llama_sampler_empty_free(struct llama_sampler * smpl) { + delete (llama_sampler_empty *) smpl->ctx; +} + +static bool llama_sampler_empty_backend_init( + struct llama_sampler * smpl, + 
ggml_backend_buffer_type_t buft) { + GGML_UNUSED(smpl); + GGML_UNUSED(buft); + + return true; +} + +static void llama_sampler_empty_backend_accept( + struct llama_sampler * smpl, + ggml_context * ctx, + ggml_cgraph * gf, + struct ggml_tensor * selected_token) { + GGML_UNUSED(smpl); + GGML_UNUSED(ctx); + GGML_UNUSED(gf); + GGML_UNUSED(selected_token); +} + +static void llama_sampler_empty_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + GGML_UNUSED(smpl); + GGML_UNUSED(ctx); + GGML_UNUSED(gf); + GGML_UNUSED(data); +} + +static void llama_sampler_empty_backend_set_input(struct llama_sampler * smpl) { + GGML_UNUSED(smpl); +} + +static struct llama_sampler_i llama_sampler_empty_i = { + /* .name = */ llama_sampler_empty_name, + /* .accept = */ llama_sampler_empty_accept, + /* .apply = */ llama_sampler_empty_apply, + /* .reset = */ llama_sampler_empty_reset, + /* .clone = */ llama_sampler_empty_clone, + /* .free = */ llama_sampler_empty_free, + /* .backend_init = */ llama_sampler_empty_backend_init, + /* .backend_accept = */ llama_sampler_empty_backend_accept, + /* .backend_apply = */ llama_sampler_empty_backend_apply, + /* .backend_set_input = */ llama_sampler_empty_backend_set_input, +}; + +struct llama_sampler * llama_sampler_init_empty(const char * name) { + return llama_sampler_init( + /* .iface = */ &llama_sampler_empty_i, + /* .ctx = */ new llama_sampler_empty { + /* .name = */ name, + } + ); +} + +// common backend sampler functionality +// +// get_name() returns the plain name until init() has been called, after that: +// +name : means that the sampler is supported and will run on the backend +// -name : means that a ggml operator is not supported by the backend +// +struct llama_sampler_backend { + llama_sampler_backend(const char * name) : name(name), name_ext(name), is_init(false), support(false) {} + + const char * get_name() { + if (!is_init) { + return name.c_str(); + } + + if (support) { + name_ext = "+" + name; + } else { + name_ext = "-" + 
name; + } + + return name_ext.c_str(); + } + + void init(bool support) { + GGML_ASSERT(this->is_init == false); + + this->is_init = true; + this->support = support; + } + +private: + std::string name; + std::string name_ext; + + bool is_init; + bool support; +}; + +// check if all ggml ops used by the sampler are supported by the backend +static bool llama_sampler_backend_support( + llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * device = ggml_backend_buft_get_device(buft); + if (!device) { + // CPU backend always supported + return true; + } + + ggml_init_params params = { + /*.mem_size =*/ 128*ggml_tensor_overhead() + ggml_graph_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx_ptr { ggml_init(params) }; + if (!ctx_ptr) { + throw std::runtime_error(format("failed to create ggml context")); + } + + ggml_context * ctx = ctx_ptr.get(); + + const int64_t n = 1024*1024; + + llama_sampler_data data = { + /*.logits = */ ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n), + /*.probs = */ nullptr, + /*.sampled = */ nullptr, + /*.candidates = */ ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n), + }; + + ggml_cgraph * gf = ggml_new_graph(ctx); + + smpl->iface->backend_apply(smpl, ctx, gf, &data); + + if (data.logits) { + ggml_build_forward_expand(gf, data.logits); + } + + if (data.probs) { + ggml_build_forward_expand(gf, data.probs); + } + + if (data.sampled) { + ggml_build_forward_expand(gf, data.sampled); + } + + if (data.candidates) { + ggml_build_forward_expand(gf, data.candidates); + } + + for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { + struct ggml_tensor * op = ggml_graph_node(gf, i); + + if (!ggml_backend_dev_supports_op(device, op)) { + LLAMA_LOG_WARN("%s: device '%s' does not have support for op %s needed for sampler '%s'\n", + __func__, ggml_backend_dev_name(device), ggml_op_name(op->op), smpl->iface->name(smpl)); + + return false; + } + } + + return true; +} + +// sampler chain + +static const char * 
llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { + return "chain"; +} + +static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto & smpl : chain->samplers) { + llama_sampler_accept(smpl.ptr, token); + } + + chain->n_sample++; +} + +static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + bool is_backend = chain->is_init; + + for (auto & smpl : chain->samplers) { + if (is_backend && smpl.is_backend) { + continue; + } + + is_backend = false; + + if (smpl.ptr->iface->apply == nullptr) { + continue; + } + + llama_sampler_apply(smpl.ptr, cur_p); + } +} + +static void llama_sampler_chain_reset(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto & smpl : chain->samplers) { + llama_sampler_reset(smpl.ptr); + } +} + +static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { + const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; + + auto * result = llama_sampler_chain_init(chain_src->params); + + for (const auto & smpl : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(smpl.ptr)); + } + + return result; +} + +static void llama_sampler_chain_free(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto & smpl : chain->samplers) { + llama_sampler_free(smpl.ptr); + } + + delete chain; +} + +static bool llama_sampler_chain_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + GGML_ASSERT(chain->is_init == false && "llama_sampler_chain_backend_init() called twice"); + + chain->is_init = true; + + bool res = 
true; + + for (auto & smpl : chain->samplers) { + bool res_cur = true; + + // to be able to run a sampler on the backend, it has to: + // - have the .backend_init() API implemented + // - return true during .backend_init() + if (smpl.ptr->iface->backend_init) { + if (!smpl.ptr->iface->backend_init(smpl.ptr, buft)) { + res_cur = false; + } + } else { + res_cur = false; + } + + smpl.is_backend = res_cur; + + res = res && res_cur; + } + + return res; +} + +static void llama_sampler_chain_backend_accept( + struct llama_sampler * smpl, + ggml_context * ctx, + ggml_cgraph * gf, + struct ggml_tensor * selected_token) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto & smpl : chain->samplers) { + if (!smpl.is_backend) { + break; + } + + if (smpl.ptr->iface->backend_accept) { + smpl.ptr->iface->backend_accept(smpl.ptr, ctx, gf, selected_token); + } + } +} + +static void llama_sampler_chain_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + GGML_ASSERT(chain->is_init && "llama_sampler_chain_backend_init() not called"); + + for (auto & smpl : chain->samplers) { + if (!smpl.is_backend) { + break; + } + + if (smpl.ptr->iface->backend_apply) { + smpl.ptr->iface->backend_apply(smpl.ptr, ctx, gf, data); + } + } +} + +static void llama_sampler_chain_backend_set_input(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto & smpl : chain->samplers) { + if (!smpl.is_backend) { + break; + } + + if (smpl.ptr->iface->backend_set_input) { + smpl.ptr->iface->backend_set_input(smpl.ptr); + } + } +} + +static struct llama_sampler_i llama_sampler_chain_i = { + /* .name = */ llama_sampler_chain_name, + /* .accept = */ llama_sampler_chain_accept, + /* .apply = */ llama_sampler_chain_apply, + /* .reset = */ llama_sampler_chain_reset, + /* .clone = */ llama_sampler_chain_clone, + /* .free = */ 
llama_sampler_chain_free, + /* .backend_init = */ llama_sampler_chain_backend_init, + /* .backend_accept = */ llama_sampler_chain_backend_accept, + /* .backend_apply = */ llama_sampler_chain_backend_apply, + /* .backend_set_input = */ llama_sampler_chain_backend_set_input, +}; + +struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { + return llama_sampler_init( + /* .iface = */ &llama_sampler_chain_i, + /* .ctx = */ new llama_sampler_chain { + /* .params = */ params, + /* .is_init = */ false, + /* .samplers = */ {}, + /* .cur = */ {}, + /* .t_sample_us = */ 0, + /* .n_sample = */ 0, + } + ); +} + +llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { + const llama_token sampled_token = llama_get_sampled_token_ith (ctx, idx); + const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx); + const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx); + const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx); + + // If a backend sampler has already sampled a token, return it. + if (sampled_token != LLAMA_TOKEN_NULL) { + LLAMA_LOG_DEBUG("%s: Backend sampler selected token for idx %d. 
Skipping CPU samplers\n", __func__, idx); + return sampled_token; + } + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_vocab = llama_vocab_n_tokens(vocab); + + // use pre-allocated buffer from chain if available, otherwise allocate locally + std::vector * cur_ptr; + std::vector cur_local; + + if (smpl->iface == &llama_sampler_chain_i) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + cur_ptr = &chain->cur; + } else { + cur_ptr = &cur_local; + } + + auto & cur = *cur_ptr; + + if (sampled_probs) { + const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx); + cur.resize(sampled_probs_count); + for (uint32_t i = 0; i < sampled_probs_count; ++i) { + cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]}; + } + } else if (sampled_logits) { + const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx); + cur.resize(sampled_logits_count); + for (llama_token i = 0; i < (int)sampled_logits_count; i++) { + cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f}; + } + } else { + const auto * logits = llama_get_logits_ith(ctx, idx); + GGML_ASSERT(logits != nullptr); + cur.resize(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + } + + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(smpl, &cur_p); + + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; +} + + +void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { + auto * p = (llama_sampler_chain *) chain->ctx; + p->samplers.push_back({ + /* .is_backend = */ false, + /* .ptr = */ 
smpl, + }); +} + +struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i) { + if (chain == nullptr) { + return nullptr; + } + + if (chain->iface != &llama_sampler_chain_i) { + return nullptr; + } + + if (i == -1) { + return chain; + } + + const auto * p = (const llama_sampler_chain *) chain->ctx; + + if (i < 0 || (size_t) i >= p->samplers.size()) { + return nullptr; + } + + return p->samplers[i].ptr; +} + +struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) { + auto * p = (llama_sampler_chain *) chain->ctx; + + if (i < 0 || (size_t) i >= p->samplers.size()) { + return nullptr; + } + + auto * result = p->samplers[i].ptr; + p->samplers.erase(p->samplers.begin() + i); + + return result; +} + +int llama_sampler_chain_n(const struct llama_sampler * chain) { + const auto * p = (const llama_sampler_chain *) chain->ctx; + + return p->samplers.size(); +} + +// +// samplers +// + +// greedy + +struct llama_sampler_greedy : public llama_sampler_backend { +}; + +static const char * llama_sampler_greedy_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_greedy *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_greedy_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_greedy *) smpl->ctx; + GGML_UNUSED(ctx); +} + +static struct llama_sampler * llama_sampler_greedy_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_greedy *) smpl->ctx; + auto * result = llama_sampler_init_greedy(); + + // copy the state + { + auto * result_ctx = (llama_sampler_greedy *) result->ctx; + + GGML_UNUSED(ctx); + GGML_UNUSED(result_ctx); + } + + return result; +} + +static void llama_sampler_greedy_free(struct llama_sampler * smpl) { + delete (llama_sampler_greedy *) smpl->ctx; +} + +static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { + cur_p->selected = 0; + for (size_t i = 1; i < 
cur_p->size; ++i) { + if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) { + cur_p->selected = i; + } + } +} + +static bool llama_sampler_greedy_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_greedy *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_greedy_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + GGML_UNUSED(gf); + GGML_UNUSED(smpl); + + struct ggml_tensor * curl = ggml_argmax(ctx, data->logits); + ggml_set_name(curl, "greedy_argmax"); + + data->sampled = curl; +} + +static struct llama_sampler_i llama_sampler_greedy_i = { + /* .name = */ llama_sampler_greedy_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_greedy_apply, + /* .reset = */ llama_sampler_greedy_reset, + /* .clone = */ llama_sampler_greedy_clone, + /* .free = */ llama_sampler_greedy_free, + /* .backend_init = */ llama_sampler_greedy_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_greedy_backend_apply, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_greedy() { + return llama_sampler_init( + /* .iface = */ &llama_sampler_greedy_i, + /* .ctx = */ new llama_sampler_greedy { + ("greedy"), + } + ); +} + +// dist + +struct llama_sampler_dist : public llama_sampler_backend { + const uint32_t seed; + uint32_t seed_cur; + + std::mt19937 rng; + + ggml_tensor * inp_uniform; +}; + +static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_dist *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dist *) smpl->ctx; + + // edge cases + if (cur_p->size == 0) { + cur_p->selected = 
-1; + return; + } + + cur_p->selected = 0; + + if (cur_p->size == 1) { + cur_p->data[0].p = 1.0f; + return; + } + + // max logit for numerical stability + float max_l = cur_p->data[0].logit; + if (!cur_p->sorted) { + for (size_t i = 1; i < cur_p->size; ++i) { + max_l = std::max(max_l, cur_p->data[i].logit); + } + } + + // apply softmax to obtain the probabilities + double sum_cum = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; + sum_cum += p; + } + +#if 1 + // sample from the obtained probabilities and normalize the probs in a single pass + // this is ~3x faster on Mac with full gpt-oss vocab than the version below + // + std::uniform_real_distribution dist(0.0f, 1.0f); + const double rnd = dist(ctx->rng); + + double sum_run = 0.0f; + const double sum_tgt = sum_cum*rnd; + + bool found = false; + for (size_t i = 0; i < cur_p->size; ++i) { + if (!found) { + // accumulate probs until we reach the target sum + sum_run += cur_p->data[i].p; + if (sum_run >= sum_tgt) { + cur_p->selected = i; + found = true; + } + } + + // normalize probs + cur_p->data[i].p /= sum_cum; + } + + // fallback to the last token (don't think this can happen) + assert(found); + if (!found) { + cur_p->selected = cur_p->size - 1; + } +#else + // for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= sum_cum; + } + + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); +#endif +} + +static void llama_sampler_dist_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dist *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_dist *) smpl->ctx; + auto * result = llama_sampler_init_dist(ctx->seed); + + // copy the state + { 
+ auto * result_ctx = (llama_sampler_dist *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_dist_free(struct llama_sampler * smpl) { + delete (llama_sampler_dist *) smpl->ctx; +} + +static bool llama_sampler_dist_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_dist *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_dist_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + GGML_UNUSED(gf); + + auto * sctx = (llama_sampler_dist *) smpl->ctx; + + sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_name (sctx->inp_uniform, "uniform"); + ggml_set_input(sctx->inp_uniform); + + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); + ggml_set_name(probs, "dist_probs"); + + struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs); + ggml_set_name(cumsum, "dist_cumsum"); + + // The uniform tensor has a random value and we subtract this tensor from + // the cumsum tensor (the uniform tensor will be broadcasted by ggml_sub). + // Recall that each entry in cumsum is the cumulative probability up to that + // index so values stay negative while the cumulative total is below the + // random value, and become zero/positive once the threshold is crossed. + struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform); + // give the difference its own name so it is not confused with the cumsum tensor above + ggml_set_name(diff, "dist_diff"); + + // The ggml_step function produces a tensor where entries are 1 if the + // corresponding entry in diff is > 0, and 0 otherwise. So all values up to + // the index where the cumulative probability exceeds the random value are 0, + // and all entries after that are 1. 
+ struct ggml_tensor * mask = ggml_step(ctx, diff); + ggml_set_name(mask, "dist_mask"); + + // Taking the sum of the mask gives us the number of elements past the + // threshold we are interested in. + struct ggml_tensor * idxf = ggml_sum(ctx, mask); + ggml_set_name(idxf, "dist_index_f32"); + + // Use ggml_scale_bias to scale the index value by -1 and then add the size + // of the mask to that value so we get the correct index ((-1 * idxf) + n). + struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32); + ggml_set_name(idx, "dist_index_i32"); + + // Map back to original vocab ids if a candidates tensor is available. + struct ggml_tensor * sampled_token = idx; + if (data->candidates != nullptr) { + struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates)); + + sampled_token = ggml_get_rows(ctx, candidates, idx); + ggml_set_name(sampled_token, "dist_sampled_token"); + } + + data->sampled = sampled_token; + data->probs = probs; +} + +static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_dist *) smpl->ctx; + + GGML_ASSERT(sctx->inp_uniform != nullptr); + + // We sample in double precision and cast to float to match rnd numbers of + // llama_sampler_dist which uses double precision (sampling from + // std::uniform_real_distribution<double> and + // std::uniform_real_distribution<float> with same rng will produce + // different sequences). 
+ std::uniform_real_distribution dist(0.0f, 1.0f); + const float rnd = dist(sctx->rng); + + ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); +} + +static struct llama_sampler_i llama_sampler_dist_i = { + /* .name = */ llama_sampler_dist_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_dist_apply, + /* .reset = */ llama_sampler_dist_reset, + /* .clone = */ llama_sampler_dist_clone, + /* .free = */ llama_sampler_dist_free, + /* .backend_init = */ llama_sampler_dist_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_dist_backend_apply, + /* .backend_set_input = */ llama_sampler_dist_backend_set_input, +}; + +struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return llama_sampler_init( + /* .iface = */ &llama_sampler_dist_i, + /* .ctx = */ new llama_sampler_dist { + ("dist"), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + /* .inp_uniform = */ nullptr, + } + ); +} + +// top-k + +struct llama_sampler_top_k : public llama_sampler_backend { + const int32_t k; +}; + +static const char * llama_sampler_top_k_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_top_k *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_top_k *) smpl->ctx; + llama_sampler_top_k_impl(cur_p, ctx->k); +} + +static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_k *) smpl->ctx; + return llama_sampler_init_top_k(ctx->k); +} + +static void llama_sampler_top_k_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_k *) smpl->ctx; +} + +static bool llama_sampler_top_k_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_top_k *) smpl->ctx; + + 
const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_top_k_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + auto * sctx = (llama_sampler_top_k *) smpl->ctx; + + struct ggml_tensor * top_k = ggml_top_k(ctx, data->logits, sctx->k); + ggml_set_name(top_k, "top_k"); + + if (data->candidates) { + struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]); + data->candidates = ggml_get_rows(ctx, candidates_rows, top_k); + data->candidates = ggml_reshape_1d(ctx, data->candidates, sctx->k); + ggml_set_name(data->candidates, "top_k_candidates"); + } else { + data->candidates = top_k; + } + + struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]); + struct ggml_tensor * top_k_rows = ggml_get_rows(ctx, logits_rows, top_k); + data->logits = ggml_reshape_1d(ctx, top_k_rows, sctx->k); + ggml_set_name(top_k_rows, "top_k_rows"); + + GGML_UNUSED(gf); +} + +static struct llama_sampler_i llama_sampler_top_k_i = { + /* .name = */ llama_sampler_top_k_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_top_k_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_top_k_clone, + /* .free = */ llama_sampler_top_k_free, + /* .backend_init = */ llama_sampler_top_k_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_top_k_backend_apply, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_top_k(int32_t k) { + const bool is_empty = (k <= 0); + + if (is_empty) { + return llama_sampler_init_empty("?top-k"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_top_k_i, + /* .ctx = */ new llama_sampler_top_k { + ("top-k"), + /* .k = */ k, + } + ); +} + +// top-p + +struct llama_sampler_top_p : public llama_sampler_backend { + const float p; + 
const size_t min_keep; + + std::vector buf_sort; +}; + +static const char * llama_sampler_top_p_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_top_p *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_top_p *) smpl->ctx; + + if (ctx->p >= 1.0f) { + return; + } + + llama_sampler_softmax_impl(cur_p, false); + + size_t k = cur_p->size; + auto * pdata = cur_p->data; + + auto & buf_sort = ctx->buf_sort; + + // if not sorted, try adaptive top-k sorting + if (!cur_p->sorted && cur_p->size > 1024) { + k = std::min(256, cur_p->size); + llama_token_data_array_partial_sort(*cur_p, k, buf_sort); + pdata = buf_sort.data(); + } else if (!cur_p->sorted) { + // small candidates -> sort inplace + llama_token_data_array_partial_sort_inplace(cur_p, k); + } + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = cur_p->size; + + for (size_t i = 0; i < cur_p->size; ++i) { + cum_sum += pdata[i].p; + + // Check if the running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) { + last_idx = i + 1; + break; + } + + // we exceeded the current top-k heuristic -> increase k and continue + if (!cur_p->sorted && i == k - 1) { + k = cur_p->size; + llama_token_data_array_partial_sort(*cur_p, k, buf_sort); + pdata = buf_sort.data(); + } + } + + // Resize the output vector to keep only the top-p tokens + if (!cur_p->sorted) { + std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data); + cur_p->sorted = true; + } + + cur_p->size = last_idx; +} + +static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_p *) smpl->ctx; + return llama_sampler_init_top_p(ctx->p, 
ctx->min_keep); +} + +static void llama_sampler_top_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_p *) smpl->ctx; +} + +static bool llama_sampler_top_p_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_top_p *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_top_p_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + auto * sctx = (llama_sampler_top_p *) smpl->ctx; + + auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(ggml_nrows(a) == 1); + struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]); + struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b); + return ggml_reshape_1d(ctx, a_sorted, a->ne[0]); + }; + + // Get the sorted logits in descending order. + struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC); + ggml_set_name(sorted_idx, "top_p_sorted_idx"); + + // Do the sorting via reshape + get_rows + struct ggml_tensor * sorted_logits = ggml_sort(data->logits, sorted_idx); + ggml_set_name(sorted_logits, "top_p_sorted_logits"); + + struct ggml_tensor * softmax = ggml_soft_max(ctx, sorted_logits); + ggml_set_name(softmax, "top_p_softmax"); + + // If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates. + if (data->candidates) { + data->candidates = ggml_sort(data->candidates, sorted_idx); + } else { + data->candidates = sorted_idx; + } + ggml_set_name(data->candidates, "top_p_candidates"); + + // Compute Cumulative Distribution Function (CDF) by means of GGML_OP_CUMSUM. 
+ struct ggml_tensor * cdf = ggml_cumsum(ctx, softmax); + ggml_set_name(cdf, "top_p_cdf"); + + // Invert CDF and add top-p value so that ggml_step yields 1 for values we want to keep + struct ggml_tensor * cdf_scaled = ggml_scale_bias(ctx, cdf, -1.0f, sctx->p); + ggml_set_name(cdf_scaled, "top_p_cdf_scaled"); + + struct ggml_tensor * mask = ggml_step(ctx, cdf_scaled); + ggml_set_name(mask, "top_p_mask"); + + // Taking the sum of the mask gives us the sum of elements after the threshold + // we are interested in. + struct ggml_tensor * idxf = ggml_sum(ctx, mask); + ggml_set_name(idxf, "top_p_index_f32"); + + // prevent out-of-bounds access + idxf = ggml_clamp(ctx, idxf, 0.0f, mask->ne[0] - 1); + + // construct ones tensor to set the value in the mask + struct ggml_tensor * ones = ggml_scale_bias(ctx, idxf, 0.0f, 1.0f); + ggml_set_name(ones, "top_p_ones"); + + // Make top-p inclusive (i.e. return all values such that cum_sum/cdf >= p) + struct ggml_tensor * mask_reshaped = ggml_reshape_2d(ctx, mask, 1, mask->ne[0]); + + mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32)); + mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]); + + // Apply -INFINITY bias for masked-out tokens + // log(1) = 0 (keep), log(0) = -INF (discard) + struct ggml_tensor * top_p_bias = ggml_log(ctx, mask); + ggml_set_name(top_p_bias, "top_p_bias"); + + data->logits = ggml_add(ctx, sorted_logits, top_p_bias); + ggml_set_name(data->logits, "top_p_logits"); + + GGML_UNUSED(gf); +} + +static struct llama_sampler_i llama_sampler_top_p_i = { + /* .name = */ llama_sampler_top_p_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_top_p_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_top_p_clone, + /* .free = */ llama_sampler_top_p_free, + /* .backend_init = */ llama_sampler_top_p_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_top_p_backend_apply, + /* .backend_set_input = */ nullptr, +}; + 
+struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) { + const bool is_empty = p >= 1.0f; + + if (is_empty) { + return llama_sampler_init_empty("?top-p"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_top_p_i, + /* .ctx = */ new llama_sampler_top_p { + ("top-p"), + /* .p = */ p, + /* .min_keep = */ min_keep, + /* .buf_sort = */ {}, + } + ); +} + +// min-p + +struct llama_sampler_min_p : public llama_sampler_backend { + const float p; + const size_t min_keep; +}; + +static const char * llama_sampler_min_p_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_min_p *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_min_p *) smpl->ctx; + + if (ctx->p <= 0.0f || !cur_p->size) { + return; + } + + bool min_p_applied = false; + + // if the cur_p aren't sorted, try the unsorted implementation first + if (!cur_p->sorted) { + std::vector filtered_tokens; + + float max_logit = -FLT_MAX; + for (size_t i = 0; i < cur_p->size; ++i) { + max_logit = std::max(max_logit, cur_p->data[i].logit); + } + const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit >= min_logit) { + filtered_tokens.push_back(cur_p->data[i]); + } + } + + // if we have enough values the operation was a success + if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) { + std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data); + cur_p->size = filtered_tokens.size(); + min_p_applied = true; + } + } + + // if the cur_p are sorted or the unsorted implementation failed, use this implementation + if (!min_p_applied) { + // Sort the logits in descending order + if (!cur_p->sorted) { + llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); + } + + const float min_logit = cur_p->data[0].logit 
+ logf(ctx->p); // min logit for p_i >= p * p_max + size_t i = 1; // first token always matches + + for (; i < cur_p->size; ++i) { + if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + cur_p->size = i; + } +} + +static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_min_p *) smpl->ctx; + return llama_sampler_init_min_p(ctx->p, ctx->min_keep); +} + +static void llama_sampler_min_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_min_p *) smpl->ctx; +} + +static bool llama_sampler_min_p_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_min_p *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_min_p_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + auto * sctx = (llama_sampler_min_p *) smpl->ctx; + + struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits); + ggml_set_name(max_idx, "max_idx"); + + struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]); + ggml_set_name(logits_rows, "logits_rows"); + + struct ggml_tensor * max_logit = ggml_get_rows(ctx, logits_rows, max_idx); + ggml_set_name(max_logit, "max_logit"); + + // Calculate the threshold value. + struct ggml_tensor * threshold = ggml_scale_bias(ctx, max_logit, 1.0f, logf(sctx->p)); + ggml_set_name(threshold, "min_p_threshold"); + + // Subtract the threshold from logits. + struct ggml_tensor * sub = ggml_sub(ctx, data->logits, threshold); + + // Create a mask where logits below the threshold are 0 (discard), + // and others are 1 (keep). 
+ struct ggml_tensor * mask = ggml_step(ctx, sub); + ggml_set_name(mask, "min_p_mask"); + + // Apply -INFINITY bias for masked-out tokens + // log(1) = 0 (keep), log(0) = -INF (discard) + struct ggml_tensor * min_p_bias = ggml_log(ctx, mask); + ggml_set_name(min_p_bias, "min_p_bias"); + + data->logits = ggml_add(ctx, data->logits, min_p_bias); + ggml_set_name(data->logits, "min_p_logits"); + + GGML_UNUSED(gf); +} + +static struct llama_sampler_i llama_sampler_min_p_i = { + /* .name = */ llama_sampler_min_p_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_min_p_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_min_p_clone, + /* .free = */ llama_sampler_min_p_free, + /* .backend_init = */ llama_sampler_min_p_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_min_p_backend_apply, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) { + const bool is_empty = (p <= 0.0f); + + if (is_empty) { + return llama_sampler_init_empty("?min-p"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_min_p_i, + /* .ctx = */ new llama_sampler_min_p { + ("min-p"), + /* .p = */ p, + /* .min_keep = */ min_keep, + } + ); +} + +// typical + +struct llama_sampler_typical { + const float p; + const size_t min_keep; +}; + +static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) { + return "typical"; +} + +static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_typical *) smpl->ctx; + + // Reference implementation: + // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr + if (ctx->p >= 1.0f) { + return; + } + + // Compute the softmax of logits and calculate entropy + llama_sampler_softmax_impl(cur_p, true); + + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + entropy += 
-cur_p->data[i].p * logf(cur_p->data[i].p); + } + + // Compute the absolute difference between negative log probability and entropy for each candidate + std::vector shifted_scores; + for (size_t i = 0; i < cur_p->size; ++i) { + float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy); + shifted_scores.push_back(shifted_score); + } + + // Sort tokens based on the shifted_scores and their corresponding indices + std::vector indices(cur_p->size); + std::iota(indices.begin(), indices.end(), 0); + + std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { + return shifted_scores[a] < shifted_scores[b]; + }); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = indices.size(); + + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + cum_sum += cur_p->data[idx].p; + + // Check if the running sum is greater than typical or if we have kept at least min_keep tokens + if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the locally typical tokens + std::vector cur_p_new; + for (size_t i = 0; i < last_idx; ++i) { + size_t idx = indices[i]; + cur_p_new.push_back(cur_p->data[idx]); + } + + // Replace the data in cur_p with the cur_p_new data + std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data); + cur_p->size = cur_p_new.size(); + cur_p->sorted = false; +} + +static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_typical *) smpl->ctx; + return llama_sampler_init_typical(ctx->p, ctx->min_keep); +} + +static void llama_sampler_typical_free(struct llama_sampler * smpl) { + delete (llama_sampler_typical *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_typical_i = { + /* .name = */ llama_sampler_typical_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_typical_apply, + /* .reset = */ nullptr, + 
/* .clone = */ llama_sampler_typical_clone, + /* .free = */ llama_sampler_typical_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) { + const bool is_empty = (p >= 1.0f); + + if (is_empty) { + return llama_sampler_init_empty("?typical"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_typical_i, + /* .ctx = */ new llama_sampler_typical { + /* .p = */ p, + /* .min_keep = */ min_keep, + } + ); +} + +// temp + +struct llama_sampler_temp : public llama_sampler_backend { + const float temp; +}; + +static const char * llama_sampler_temp_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_temp *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_temp *) smpl->ctx; + + llama_sampler_temp_impl(cur_p, ctx->temp); +} + +static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_temp *) smpl->ctx; + return llama_sampler_init_temp(ctx->temp); +} + +static void llama_sampler_temp_free(struct llama_sampler * smpl) { + delete (llama_sampler_temp *) smpl->ctx; +} + +static void llama_sampler_backend_temp_sampling( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data, + float temp) { + if (temp <= 0.0f) { + // Find the most probable token index. 
+ struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits); + ggml_set_name(max_idx, "temp_max_idx"); + + if (data->candidates) { + struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]); + data->candidates = ggml_get_rows(ctx, candidates_rows, max_idx); + } else { + data->candidates = max_idx; + } + + struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]); + data->logits = ggml_get_rows(ctx, logits_rows, max_idx); + + return; + } + + data->logits = ggml_scale(ctx, data->logits, 1.0f / temp); + + GGML_UNUSED(gf); +} + +static bool llama_sampler_temp_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_temp *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_temp_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + auto * sctx = (llama_sampler_temp *) smpl->ctx; + llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp); +} + +static struct llama_sampler_i llama_sampler_temp_i = { + /* .name = */ llama_sampler_temp_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_temp_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_temp_clone, + /* .free = */ llama_sampler_temp_free, + /* .backend_init = */ llama_sampler_temp_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_temp_backend_apply, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_temp(float temp) { + const bool is_empty = temp == 1.0f; + + if (is_empty) { + return llama_sampler_init_empty("?temp"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_temp_i, + /* .ctx = */ new llama_sampler_temp { + ("temp"), + /*.temp = */ temp, + } + ); +} + +// temp-ext + 
+struct llama_sampler_temp_ext : public llama_sampler_backend { + const float temp; + const float delta; + const float exponent; +}; + +static const char * llama_sampler_temp_ext_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_temp_ext *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_temp_ext *) smpl->ctx; + if (ctx->delta > 0) { + const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); + const float max_temp = ctx->temp + ctx->delta; + + float exponent_val = ctx->exponent; + + // no need to do anything if there is only one (or zero) candidates + if (cur_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / cur_p->size); + + llama_sampler_softmax_impl(cur_p, true); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float prob = cur_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + + #ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); + #endif + + // Apply the dynamically calculated temperature scaling + llama_sampler_temp_impl(cur_p, dyn_temp); + + // Re-compute softmax probabilities after 
scaling logits with dynamic temperature + const double max_l_double = cur_p->data[0].logit; + + double cum_sum_double = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + double p = exp(cur_p->data[i].logit - max_l_double); + cur_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + + #ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < cur_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f); + } + #endif + } else { + llama_sampler_temp_impl(cur_p, ctx->temp); + } +} + +static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx; + return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent); +} + +static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) { + delete (llama_sampler_temp_ext *) smpl->ctx; +} + +static bool llama_sampler_temp_ext_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_temp_ext *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_temp_ext_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + auto * sctx = (llama_sampler_temp_ext *) smpl->ctx; + + // Revert to standard temperature scaling if delta or temp are non-positive. + if (sctx->delta <= 0.0f || sctx->temp <= 0.0f) { + llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp); + return; + } + + // Calculate min_temp, max_temp, and max_entropy. 
+ const float min_temp = std::max(0.0f, sctx->temp - sctx->delta); + const float max_temp = sctx->temp + sctx->delta; + const float max_entropy = logf(data->logits->ne[0]); + + // Calculate the probabilities. + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); + ggml_set_name(probs, "temp_ext_softmax_probs"); + + // Clamp probabilities to avoid log(0) which would give -inf + struct ggml_tensor * probs_clamped = ggml_clamp(ctx, probs, 1e-10f, 1.0f); + ggml_set_name(probs_clamped, "temp_ext_probs_clamped"); + + // Calculate the entropy, entropy = -Σ(p * log(p)). + struct ggml_tensor * log_probs = ggml_log(ctx, probs_clamped); + struct ggml_tensor * p_log_p = ggml_mul(ctx, probs_clamped, log_probs); + struct ggml_tensor * sum_p_log_p = ggml_sum(ctx, p_log_p); + struct ggml_tensor * entropy = ggml_scale(ctx, sum_p_log_p, -1.0f); + ggml_set_name(log_probs, "temp_ext_log_probs"); + ggml_set_name(p_log_p, "temp_ext_p_log_p"); + ggml_set_name(sum_p_log_p, "temp_ext_sum_p_log_p"); + ggml_set_name(entropy, "temp_ext_entropy"); + + // Normalize the entropy, norm_entropy = entropy / max_entropy + struct ggml_tensor * norm_entropy = ggml_scale(ctx, entropy, 1.0f / max_entropy); + ggml_set_name(norm_entropy, "temp_ext_norm_entropy"); + + // Calculate the dynamic temperature: + // dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent); + // + // Calculate powf(normalized_entropy, exponent) as + // norm_entropy^exponent = exp(exponent * log(norm_entropy)) + struct ggml_tensor * log_norm_entropy = ggml_log(ctx, norm_entropy); + struct ggml_tensor * scaled_log = ggml_scale(ctx, log_norm_entropy, sctx->exponent); + struct ggml_tensor * pow_entropy = ggml_exp(ctx, scaled_log); + // With pow_entropy computed we can now compute dyn_temp, scaling by + // (max_temp - min_temp) and then adding min_temp. 
+ struct ggml_tensor * dyn_temp = ggml_scale_bias(ctx, pow_entropy, max_temp - min_temp, min_temp); + ggml_set_name(log_norm_entropy, "temp_ext_log_norm_entropy"); + ggml_set_name(scaled_log, "temp_ext_scaled_log"); + ggml_set_name(pow_entropy, "temp_ext_pow_entropy"); + ggml_set_name(dyn_temp, "temp_ext_dyn_temp"); + + // Scale the logits by the dynamic temperature + struct ggml_tensor * scaled_logits = ggml_div(ctx, data->logits, dyn_temp); + ggml_set_name(scaled_logits, "temp_ext_scaled_logits"); + + data->logits = scaled_logits; +} + +static struct llama_sampler_i llama_sampler_temp_ext_i = { + /* .name = */ llama_sampler_temp_ext_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_temp_ext_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_temp_ext_clone, + /* .free = */ llama_sampler_temp_ext_free, + /* .backend_init = */ llama_sampler_temp_ext_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_temp_ext_backend_apply, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) { + const bool is_empty = temp == 1.0f && delta <= 0.0f; + + if (is_empty) { + return llama_sampler_init_empty("?temp-ext"); + } + + auto * res = llama_sampler_init( + /* .iface = */ &llama_sampler_temp_ext_i, + /* .ctx = */ new llama_sampler_temp_ext { + ("temp-ext"), + /* .temp = */ temp, + /* .delta = */ delta, + /* .exponent = */ exponent, + } + ); + + return res; +} + +// xtc + +struct llama_sampler_xtc { + const float probability; + const float threshold; + const size_t min_keep; + + const uint32_t seed; + uint32_t seed_cur; + + std::mt19937 rng; +}; + +static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { + return "xtc"; +} + +static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + + if (ctx->probability <= 0.0f + || 
ctx->threshold > 0.5f + || cur_p->size < 2) { + return; + } + + std::uniform_real_distribution distribution(0.0f, 1.0f); + float chance = distribution(ctx->rng); + if (chance > ctx->probability) { + return; + } + + llama_sampler_softmax_impl(cur_p, true); + + int pos_last = 0; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p >= ctx->threshold) { + pos_last = i; + } else { + break; + } + } + + if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { + cur_p->data += pos_last; + cur_p->size -= pos_last; + } +} + +static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_xtc *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_xtc_free(struct llama_sampler * smpl) { + delete (llama_sampler_xtc *) smpl->ctx; +} + +static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler_i llama_sampler_xtc_i = { + /* .name = */ llama_sampler_xtc_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sample_xtc_apply, + /* .reset = */ llama_sampler_xtc_reset, + /* .clone = */ llama_sampler_xtc_clone, + /* .free = */ llama_sampler_xtc_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { + const bool is_empty = (p <= 0.0f || t > 0.5f); + + if (is_empty) { + return llama_sampler_init_empty("?xtc"); + } + + const auto seed_cur = get_rng_seed(seed); + + return llama_sampler_init( + /* .iface = */ 
&llama_sampler_xtc_i, + /* .ctx = */ new llama_sampler_xtc { + /* .probability = */ p, + /* .threshold = */ t, + /* .min_keep = */ min_keep, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + } + ); +} + +// mirostat + +struct llama_sampler_mirostat { + const int32_t n_vocab; + + const uint32_t seed; + uint32_t seed_cur; + + const float tau; + const float eta; + + const int32_t m; + + float mu; + + std::mt19937 rng; +}; + +static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { + return "mirostat"; +} + +static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_mirostat *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p, true); + + // Estimate s_hat using the most probable m tokens + float s_hat = 0.0; + float sum_ti_bi = 0.0; + float sum_ti_sq = 0.0; + for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) { + float t_i = logf(float(i + 2) / float(i + 1)); + float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p); + sum_ti_bi += t_i * b_i; + sum_ti_sq += t_i * t_i; + } + s_hat = sum_ti_bi / sum_ti_sq; + + // Compute k from the estimated s_hat and target surprise value + float epsilon_hat = s_hat - 1; + float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat); + + llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); + + llama_sampler_softmax_impl(cur_p, true); + + const int idx = llama_sample_dist(cur_p, ctx->rng); + + cur_p->selected = idx; + + float observed_surprise = -log2f(cur_p->data[idx].p); + float e = observed_surprise - ctx->tau; + + // Update mu using the learning rate and error + ctx->mu = ctx->mu - ctx->eta * e; +} + +static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx; + auto * result = llama_sampler_init_mirostat(ctx->n_vocab, 
ctx->seed, ctx->tau, ctx->eta, ctx->m); + + // copy the state + { + auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx; + + result_ctx->mu = ctx->mu; + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_mirostat *) smpl->ctx; + ctx->mu = 2.0f*ctx->tau; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { + delete (llama_sampler_mirostat *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_mirostat_i = { + /* .name = */ llama_sampler_mirostat_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_mirostat_apply, + /* .reset = */ llama_sampler_mirostat_reset, + /* .clone = */ llama_sampler_mirostat_clone, + /* .free = */ llama_sampler_mirostat_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { + const auto seed_cur = get_rng_seed(seed); + + return llama_sampler_init( + /* .iface = */ &llama_sampler_mirostat_i, + /* .ctx = */ new llama_sampler_mirostat { + /* .n_vocab = */ n_vocab, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .m = */ m, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), + } + ); +} + +// mirostat v2 + +struct llama_sampler_mirostat_v2 { + const uint32_t seed; + uint32_t seed_cur; + + const float tau; + const float eta; + + float mu; + + std::mt19937 rng; +}; + +static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { + return "mirostat-v2"; +} + +static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; + + 
llama_sampler_softmax_impl(cur_p, true); + + // Truncate the words with surprise values greater than mu + cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) { + return -log2f(candidate.p) > ctx->mu; + })); + + if (cur_p->size == 0) { + cur_p->size = 1; + } + + // Normalize the probabilities of the remaining words + llama_sampler_softmax_impl(cur_p, true); + + const int idx = llama_sample_dist(cur_p, ctx->rng); + + cur_p->selected = idx; + + float observed_surprise = -log2f(cur_p->data[idx].p); + float e = observed_surprise - ctx->tau; + + // Update mu using the learning rate and error + ctx->mu = ctx->mu - ctx->eta * e; +} + +static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; + ctx->mu = 2.0f*ctx->tau; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx; + + auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta); + + // copy the state + { + auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx; + + result_ctx->mu = ctx->mu; + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) { + delete (llama_sampler_mirostat_v2 *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_mirostat_v2_i = { + /* .name = */ llama_sampler_mirostat_v2_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_mirostat_v2_apply, + /* .reset = */ llama_sampler_mirostat_v2_reset, + /* .clone = */ llama_sampler_mirostat_v2_clone, + /* .free = */ llama_sampler_mirostat_v2_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + 
+struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { + auto seed_cur = get_rng_seed(seed); + return llama_sampler_init( + /* .iface = */ &llama_sampler_mirostat_v2_i, + /* .ctx = */ new llama_sampler_mirostat_v2 { + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), + } + ); +} + +// grammar + +struct llama_sampler_grammar { + const struct llama_vocab * vocab; + + std::string grammar_str; + std::string grammar_root; + + struct llama_grammar * grammar; +}; + +static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) { + return "grammar"; +} + +static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (ctx->grammar) { + llama_grammar_accept_impl(*ctx->grammar, token); + } +} + +static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (ctx->grammar) { + llama_grammar_apply_impl(*ctx->grammar, cur_p); + } +} + +// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle. 
+static struct llama_sampler * llama_sampler_init_grammar_impl( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root, + bool lazy, + const char ** trigger_words, + size_t num_trigger_words, + const llama_token * trigger_tokens, + size_t num_trigger_tokens, + const char ** trigger_patterns, + size_t num_trigger_patterns); + +static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (!ctx->grammar) { + return; + } + + std::vector trigger_patterns_c; + trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size()); + for (auto & trigger_pattern : ctx->grammar->trigger_patterns) { + trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); + } + + auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(), + ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), + ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); + + llama_grammar_free_impl(ctx->grammar); + ctx->grammar = grammar_new; +} + +static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; + + auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0); + GGML_ASSERT(result); + + // copy the state + { + auto * result_ctx = (llama_sampler_grammar *) result->ctx; + + if (ctx->grammar) { + result_ctx->grammar_str = ctx->grammar_str; + result_ctx->grammar_root = ctx->grammar_root; + + result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar); + } + } + + return result; +} + +static void llama_sampler_grammar_free(struct llama_sampler * smpl) { + const auto * ctx = (llama_sampler_grammar *) smpl->ctx; + + if (ctx->grammar) { + llama_grammar_free_impl(ctx->grammar); + } + + delete ctx; +} + +static struct llama_sampler_i 
llama_sampler_grammar_i = { + /* .name = */ llama_sampler_grammar_name, + /* .accept = */ llama_sampler_grammar_accept_impl, + /* .apply = */ llama_sampler_grammar_apply, + /* .reset = */ llama_sampler_grammar_reset, + /* .clone = */ llama_sampler_grammar_clone, + /* .free = */ llama_sampler_grammar_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +static struct llama_sampler * llama_sampler_init_grammar_impl( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root, + bool lazy, + const char ** trigger_words, + size_t num_trigger_words, + const llama_token * trigger_tokens, + size_t num_trigger_tokens, + const char ** trigger_patterns, + size_t num_trigger_patterns) { + auto * ctx = new llama_sampler_grammar; + + if (grammar_str != nullptr && grammar_str[0] != '\0') { + std::string trigger_pattern; + llama_grammar * grammar = nullptr; + // TODO: remove trigger_words support. 
+ if (trigger_words != nullptr && num_trigger_words > 0) { + GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0); + trigger_pattern = "[\\s\\S]*?("; + for (size_t i = 0; i < num_trigger_words; ++i) { + static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); + if (i > 0) { + trigger_pattern += "|"; + } + trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0"); + } + trigger_pattern += ")[\\s\\S]*"; + + std::array tmp_trigger_patterns = { trigger_pattern.c_str() }; + grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens); + } else { + grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens); + } + *ctx = { + /* .vocab = */ vocab, + /* .grammar_str = */ grammar_str, + /* .grammar_root = */ grammar_root, + /* .grammar = */ grammar, + }; + if (!ctx->grammar) { + delete ctx; + return nullptr; + } + } else { + *ctx = { + /* .vocab = */ vocab, + /* .grammar_str = */ {}, + /* .grammar_root = */ {}, + /* .grammar = */ nullptr, + }; + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_grammar_i, + /* .ctx = */ ctx + ); +} + +struct llama_sampler * llama_sampler_init_grammar( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root) { + return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0); +} + +struct llama_sampler * llama_sampler_init_grammar_lazy( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root, + const char ** trigger_words, + size_t num_trigger_words, + const llama_token * trigger_tokens, + size_t num_trigger_tokens) { + return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, 
trigger_tokens, num_trigger_tokens, nullptr, 0); +} + +struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root, + const char ** trigger_patterns, + size_t num_trigger_patterns, + const llama_token * trigger_tokens, + size_t num_trigger_tokens) { + return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns); +} + +// penalties + +struct llama_sampler_penalties { + const int32_t penalty_last_n; + const float penalty_repeat; + const float penalty_freq; + const float penalty_present; + + ring_buffer prev; + + // a frequency map to count token occurrences + std::unordered_map token_count; +}; + +static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) { + return "penalties"; +} + +static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + if (ctx->penalty_last_n == 0) { + return; + } + + ctx->token_count[token]++; + + // if the ring buffer is full, remove the oldest token + if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) { + const auto old = ctx->prev.front(); + + ctx->token_count[old]--; + if (ctx->token_count[old] == 0) { + ctx->token_count.erase(old); + } + } + + ctx->prev.push_back(token); + +#if 0 + // sanity check + std::unordered_map tmp; + for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { + tmp[ctx->prev.rat(i)]++; + } + + assert(ctx->token_count == tmp); +#endif +} + +static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + + if ((ctx->penalty_last_n == 0) || + (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) { + return; + } + + // Apply frequency and 
presence penalties to the cur_p + for (size_t i = 0; i < cur_p->size; ++i) { + const auto token_iter = ctx->token_count.find(cur_p->data[i].id); + if (token_iter == ctx->token_count.end()) { + continue; + } + + const int count = token_iter->second; + + assert(count > 0 && count <= ctx->penalty_last_n); + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (cur_p->data[i].logit <= 0) { + cur_p->data[i].logit *= ctx->penalty_repeat; + } else { + cur_p->data[i].logit /= ctx->penalty_repeat; + } + + cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present; + } + + cur_p->sorted = false; +} + +static void llama_sampler_penalties_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + ctx->prev.clear(); + ctx->token_count.clear(); +} + +static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_penalties *) smpl->ctx; + auto * result = llama_sampler_init_penalties( + ctx->penalty_last_n, + ctx->penalty_repeat, + ctx->penalty_freq, + ctx->penalty_present); + + // copy the state + { + auto * result_ctx = (llama_sampler_penalties *) result->ctx; + + result_ctx->prev = ctx->prev; + } + + return result; +} + +static void llama_sampler_penalties_free(struct llama_sampler * smpl) { + delete (llama_sampler_penalties *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_penalties_i = { + /* .name = */ llama_sampler_penalties_name, + /* .accept = */ llama_sampler_penalties_accept, + /* .apply = */ llama_sampler_penalties_apply, + /* .reset = */ llama_sampler_penalties_reset, + /* .clone = */ llama_sampler_penalties_clone, + /* .free = */ llama_sampler_penalties_free, + /* 
.backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_penalties( + int32_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present) { + penalty_last_n = std::max(penalty_last_n, 0); + + const bool is_empty = (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)); + + if (is_empty) { + return llama_sampler_init_empty("?penalties"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_penalties_i, + /* .ctx = */ new llama_sampler_penalties { + /* .penalty_last_n = */ penalty_last_n, + /* .penalty_repeat = */ penalty_repeat, + /* .penalty_freq = */ penalty_freq, + /* .penalty_present = */ penalty_present, + /* .prev = */ ring_buffer(penalty_last_n), + /* .token_count = */ {}, + } + ); +} + +// top-n-sigma + +struct llama_sampler_top_n_sigma { + const float n; +}; + +static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) { + return "top-n-sigma"; +} + +static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx; + + if (ctx->n <= 0.0f || cur_p->size <= 1) { + return; + } + + // find max logit and calculate mean + float max = cur_p->data[0].logit; + float logits_sum = 0; + size_t valid_count = 0; + for (size_t i = 0; i < cur_p->size; ++i) { + // Only count non-negative infinity values + if (cur_p->data[i].logit != -INFINITY) { + max = std::max(max, cur_p->data[i].logit); + logits_sum += cur_p->data[i].logit; + valid_count++; + } + } + float mean = valid_count > 0 ? 
logits_sum/valid_count : 0; + + // calculate standard deviation + float acc = 0; + for (size_t i = 0; i < cur_p->size; ++i) { + // Skip -infinity in std calculation + if (cur_p->data[i].logit != -INFINITY) { + acc += pow(cur_p->data[i].logit - mean, 2); + } + } + float std = valid_count > 0 ? sqrt(acc/valid_count) : 0; + + // apply mask + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit < max - (ctx->n * std)) { + cur_p->data[i].logit = -INFINITY; + } + } + + llama_sampler_softmax_impl(cur_p, true); +} + +static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx; + return llama_sampler_init_top_n_sigma(ctx->n); +} + +static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_n_sigma *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_top_n_sigma_i = { + /* .name = */ llama_sampler_top_n_sigma_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_top_n_sigma_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_top_n_sigma_clone, + /* .free = */ llama_sampler_top_n_sigma_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_top_n_sigma(float n) { + const bool is_empty = (n <= 0.0f); + + if (is_empty) { + return llama_sampler_init_empty("?top-n-sigma"); + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_top_n_sigma_i, + /* .ctx = */ new llama_sampler_top_n_sigma { + /* .n = */ n, + } + ); +} + +// DRY + +struct llama_sampler_dry { + int32_t total_context_size; + + const float dry_multiplier; + const float dry_base; + const int32_t dry_allowed_length; + const int32_t dry_penalty_last_n; + + std::unordered_multimap> dry_processed_breakers; + std::vector dry_repeat_count; + std::unordered_map dry_max_token_repeat; + 
ring_buffer last_tokens; +}; + +// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) +static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap>& token_sequences, int max_tail_len = -1) { + for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) { + std::string word = vocab.detokenize({token_id}, true); + if (word.find(str) != std::string::npos) { + token_sequences.emplace(token_id, std::vector()); + } else { + size_t word_len = word.size(); + size_t str_len = str.size(); + size_t pos = -1; + while ((pos = word.find(str[0], pos + 1)) != std::string::npos) { + bool match = true; + size_t i; + for (i = 1; i < str_len && i + pos < word_len; ++i) { + if (word[pos + i] != str[i]) { + match = false; + break; + } + } + if (match) { + std::vector tokenization = vocab.tokenize(str.substr(i), false, false); + if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) { + tokenization.resize(max_tail_len); + } + + // Ensure we don't already have a duplicate matching tokenization + auto its = token_sequences.equal_range(token_id); + bool found = false; + for (auto it = its.first; it != its.second; ++it) { + if (tokenization == it->second) { + found = true; + break; + } + } + if (!found) { + token_sequences.emplace(token_id, tokenization); + } + } + } + } + } +} + +static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) { + return "dry"; +} + +static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_dry *) smpl->ctx; + if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) { + return; + } + + ctx->last_tokens.push_back(token); +} + +// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) +static void llama_sampler_dry_apply(struct llama_sampler * 
smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dry *) smpl->ctx; + + if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) { + return; + } + + int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0); + int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size); + + if (last_n_repeat <= ctx->dry_allowed_length) { + return; + } + + ctx->dry_repeat_count.assign(last_n_repeat, 0); + ctx->dry_max_token_repeat.clear(); + + // Step 1: Look for restart sequences to limit the maximum repetition length. + // Work backwards through the context looking for any token that begins a restart sequence. + // + // The collection `restart_sequences` is a mapping from a "head" token to all "tail" + // sequences that together comprise a restart sequence. This allows us to quickly check + // whether each token is the head of a complete sequence. Most restart sequences are actually + // a single token, and for these the "tail" is an empty vector. + // + // If the token is a "head", test all restart sequences that begin with this token + // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and + // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The + // longest matching sequence (if any) is used to limit the maximum repetition length. + // + // Note that in the case case of a short sequence contained in a longer one, this might fail to + // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as + // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress + // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare. 
+ // + // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we + // have already clamped the maximum tail sequence length when generating `restart_sequences`. + // With clamping, this scan is O(N) in the context length. + + int rep_limit = last_n_repeat; + for (int i = 0; i < last_n_repeat; ++i) { + llama_token token = ctx->last_tokens.rat(i); + auto its = ctx->dry_processed_breakers.equal_range(token); + if (its.first == ctx->dry_processed_breakers.end()) { + continue; + } + int longest_match = -1; + for (auto it = its.first; it != its.second; ++it) { + // Note that (*it) does not contain the head character, so seq_len will be + // the restart sequence length minus 1. + // In the common case of a single-token restart sequence, (*it) will be empty + // and we will trivially match. + int seq_len = (int)it->second.size(); + if (seq_len > longest_match && seq_len <= (int)i) { + bool match = true; + for (int offset = 0; offset < seq_len; ++offset) { + // The -1 when indexing `last_tokens` is because we already matched the head. + if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) { + match = false; + break; + } + } + if (match) { + longest_match = seq_len; + } + } + } + if (longest_match >= 0) { + // We found a restart sequence starting `i` tokens from the end and continuing for + // `longest_match` tokens. + rep_limit = i - longest_match; + break; + } + } + if (rep_limit < ctx->dry_allowed_length) { + return; + } + + // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in + // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing + // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences. 
+ // + // This algorithm is not currently documented on Wikipedia, but there is a clear description here: + // https://ivanyu.me/blog/2014/10/15/z-algorithm/ + // + // The code below is adapted from the public domain implementation by the same author here: + // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py + // + // Example: + // Last N tokens: a b c c b c y a b c + // Repeat counts: 0 0 3 1 0 2 0 0 0 0 + // ^ + // This `3` means that the last three tokens of the context (a b c) also appear here. + // + // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested + // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each + // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables + // ensure that the inner while loops only examine each token in the context once as the outer + // for loop iterates over the context. + + { + const int last = last_n_repeat - 1; + + int rt = 0; + int lt = 0; + + for (int k = 1; k < last_n_repeat; ++k) { + if (k > rt) { + // If k is outside the current Z-box, do naive computation. + int n = 0; + while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) { + ++n; + } + ctx->dry_repeat_count[last - k] = std::min(n, rep_limit); + if (n > 0) { + lt = k; + rt = k + n - 1; + } + } else { + // If k is inside the current Z-box, consider two cases. + + int p = k - lt; // Pair index. 
+ int right_part_len = rt - k + 1; + + if (ctx->dry_repeat_count[last - p] < right_part_len) { + int n = std::min(ctx->dry_repeat_count[last - p], rep_limit); + ctx->dry_repeat_count[last - k] = n; + } else { + int i = rt + 1; + while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) { + i += 1; + } + + int n = std::min(i - k, rep_limit); + ctx->dry_repeat_count[last - k] = n; + lt = k; + rt = i - 1; + } + } + } + } + + // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length + // that would be generated by emitting each new token that would extend a sequence. + // + // Following the same example as above: + // Last N tokens: a b c c b c y a b c + // Repeat counts: 0 0 3 1 0 2 0 0 0 0 + // + // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition. + // c: 3 -> 4 (from `a b c` to `a b c c`) + // b: 1 -> 2 (from `c` to `c b`) + // y: 2 -> 3 (from `b c` to `b c y`) + + for (int i = 0; i < last_n_repeat - 1; ++i) { + int repeat_len = ctx->dry_repeat_count[i]; + if (repeat_len >= ctx->dry_allowed_length) { + // This token ends a repeat, so the next token would continue one. + // By convention, the value of `repeat_len` only includes the tokens currently + // in the context, not the new token that would be added. + llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i); + // Track the maximum sequence ending in this token. + const auto& it = ctx->dry_max_token_repeat.find(token); + if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) { + ctx->dry_max_token_repeat[token] = repeat_len; + } + } + } + + // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens. + + // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`. 
+ // Compute it from `penalty_base` and the approximate log of `std::numeric_limits::max()` + const float FLOAT_MAX_LOG = 88.7228391f; + int max_exponent = 0; + if (ctx->dry_base > 1.000001f) { + max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base); + } + + for (size_t i = 0; i < cur_p->size; ++i) { + const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id); + if (af_kvp != ctx->dry_max_token_repeat.end()) { + // Check all sequence breakers starting with this token + auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id); + bool is_single_token_breaker = false; + + for (auto it = range.first; it != range.second; ++it) { + if (it->second.empty()) { + is_single_token_breaker = true; + break; + } + } + + // Apply penalty only if it's not a single-token sequence breaker + if (!is_single_token_breaker) { + int repeat_exp = af_kvp->second - ctx->dry_allowed_length; + if (max_exponent > 0 && repeat_exp > max_exponent) { + repeat_exp = max_exponent; + } + float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp); + cur_p->data[i].logit -= penalty; + } + } + } + + cur_p->sorted = false; +} + +static void llama_sampler_dry_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dry *) smpl->ctx; + ctx->last_tokens.clear(); + ctx->dry_repeat_count.clear(); + ctx->dry_max_token_repeat.clear(); +} + +static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) { + const auto * ctx = (llama_sampler_dry *) smpl->ctx; + + llama_vocab dummy_vocab; + + // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying + auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0); + + // Copy the state, including the processed breakers + { + auto * result_ctx = (llama_sampler_dry *) result->ctx; + 
result_ctx->dry_processed_breakers = ctx->dry_processed_breakers; + result_ctx->dry_repeat_count = ctx->dry_repeat_count; + result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat; + result_ctx->last_tokens = ctx->last_tokens; + } + + return result; +} + +static void llama_sampler_dry_free(struct llama_sampler * smpl) { + delete (llama_sampler_dry *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_dry_i = { + /* .name = */ llama_sampler_dry_name, + /* .accept = */ llama_sampler_dry_accept, + /* .apply = */ llama_sampler_dry_apply, + /* .reset = */ llama_sampler_dry_reset, + /* .clone = */ llama_sampler_dry_clone, + /* .free = */ llama_sampler_dry_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) { + int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? 
n_ctx_train : std::max(dry_penalty_last_n, 0); + std::unordered_multimap> processed_breakers; + const int MAX_CHAR_LEN = 40; + const int MAX_SEQ_LEN = 20; + + const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0); + + if (!dry_enabled) { + return llama_sampler_init_empty("?dry"); + } + + if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) { + // Process sequence breakers + for (size_t i = 0; i < num_breakers; ++i) { + if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) { + LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i); + continue; + } + + std::string sequence_break(seq_breakers[i]); + if (sequence_break.empty()) { + LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n"); + continue; + } + + if (sequence_break.size() > MAX_CHAR_LEN) { + LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN); + sequence_break.resize(MAX_CHAR_LEN); + } + + get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN); + } + } + + return llama_sampler_init( + /* .iface = */ &llama_sampler_dry_i, + /* .ctx = */ new llama_sampler_dry { + /* .total_context_size = */ n_ctx_train, + /* .dry_multiplier = */ dry_multiplier, + /* .dry_base = */ dry_base, + /* .dry_allowed_length = */ dry_allowed_length, + /* .dry_penalty_last_n = */ dry_penalty_last_n, + /* .dry_processed_breakers = */ std::move(processed_breakers), + /* .dry_repeat_count = */ dry_enabled ? std::vector(effective_dry_penalty_last_n, 0) : std::vector{}, + /* .dry_max_token_repeat = */ {}, + /* .last_tokens = */ dry_enabled ? 
ring_buffer(effective_dry_penalty_last_n) : ring_buffer(0), + } + ); +} + +// wrapper for test-sampling.cpp +struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector>& seq_breakers) { + llama_vocab dummy_vocab; + auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0); + auto * ctx = (llama_sampler_dry *) result->ctx; + + // Process the token-based sequence breakers + ctx->dry_processed_breakers.clear(); + if (seq_breakers.empty()) { + LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n"); + } else { + for (const auto& breaker : seq_breakers) { + if (breaker.empty()) { + LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n"); + continue; + } + llama_token head_token = breaker[0]; + std::vector tail_tokens(breaker.begin() + 1, breaker.end()); + ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens)); + } + + if (ctx->dry_processed_breakers.empty()) { + LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n"); + } + } + + return result; +} + +// adaptive-p sampler state +// +// maintains an exponential moving average of the *ORIGINAL* probabilities +// of selected tokens, used to compute an adapted target at each sampling step. 
+// +// see llama.h for a full description of the sampler +// +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 +// +struct llama_sampler_adaptive_p { + const float target; // target probability (0.0 - 1.0; negative = disabled) + const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) + const uint32_t seed; // original RNG seed + uint32_t seed_cur; // actual RNG seed + std::mt19937 rng; // RNG state + float weighted_sum; // sum(p_i * decay^i) + float total_weight; // sum(decay^i), converges to 1/(1-decay) + std::vector original_probs; // pre-transform probs, cached for EMA update + llama_token pending_token_id; // token ID of selected token + int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs +}; + +// adaptive probability transformation constants +static constexpr float DISTRIBUTION_WIDTH = 0.3f; +static constexpr float PEAK_LOGIT_VALUE = 5.0f; +static constexpr float SHARPNESS = 10.0f; +static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; + +static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) { + return "adaptive-p"; +} + +static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p, false); + + if (ctx->target < 0.0f) { + // at negative target values, adaptive-p is no-op + // we simply sample from the existing distribution + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); + return; + } + + // store the original probabilities + ctx->original_probs.resize(cur_p->size); + for (size_t i = 0; i < cur_p->size; ++i) { + ctx->original_probs[i] = cur_p->data[i].p; + } + + // using the EMA, compute the adapted target probability for the current sampling step + auto target = std::clamp(ctx->target, 0.0f, 1.0f); + float adapted_target = std::clamp( + ctx->total_weight == 0.0f ? 
target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight), + 0.0f, 1.0f + ); + + // adaptive probability transform + // + // quadratic near target for fine differentiation, transitioning to linear decay in the + // tails. unbounded negative logits ensure proper suppression of far-from-target tokens + // after the softmax. + // + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit == -INFINITY) { + // don't transform logits that are -INFINITY + // (as masked out by e.g. min-p and top-p when using backend sampling) + continue; + } + float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH); + cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist); + } + + // softmax and sample from the transformed distribution + llama_sampler_softmax_impl(cur_p, false); + const int idx = llama_sample_dist(cur_p, ctx->rng); + cur_p->selected = idx; + + // store the selected token ID for acceptance later + ctx->pending_token_id = cur_p->data[idx].id; + ctx->pending_token_idx = idx; +} + +static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + if (ctx->pending_token_id == token) { + GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL); + GGML_ASSERT(ctx->pending_token_idx != -1); + // update EMA with the original probability of the selected token + ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum; + ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; + } + ctx->pending_token_id = LLAMA_TOKEN_NULL; + ctx->pending_token_idx = -1; +} + +static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + // ctx->target and ctx->decay never change after init, so it's safe to keep them as is. + // original_probs is completely overwritten on every call to _apply. 
+ // so we only need to reset the EMA state and pending token. + ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); + ctx->total_weight = 1.0f / (1.0f - ctx->decay); + ctx->pending_token_id = LLAMA_TOKEN_NULL; + ctx->pending_token_idx = -1; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx; + auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed); + auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx; + + // copy everything (target, decay, seed, and RNG are already set) + result_ctx->weighted_sum = ctx->weighted_sum; + result_ctx->total_weight = ctx->total_weight; + result_ctx->pending_token_id = ctx->pending_token_id; + result_ctx->pending_token_idx = ctx->pending_token_idx; + + return result; +} + +static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_adaptive_p *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_adaptive_p_i = { + /* .name = */ llama_sampler_adaptive_p_name, + /* .accept = */ llama_sampler_adaptive_p_accept, + /* .apply = */ llama_sampler_adaptive_p_apply, + /* .reset = */ llama_sampler_adaptive_p_reset, + /* .clone = */ llama_sampler_adaptive_p_clone, + /* .free = */ llama_sampler_adaptive_p_free, + /* .backend_init = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_adaptive_p( + float target, + float decay, + uint32_t seed +) { + auto seed_cur = get_rng_seed(seed); + float clamped_decay = std::clamp(decay, 0.0f, 0.99f); + return llama_sampler_init( + /* .iface = */ &llama_sampler_adaptive_p_i, + /* .ctx = */ new llama_sampler_adaptive_p { + /* .target = */ target, + /* .decay = */ clamped_decay, + /* .seed = */ seed, + /* .seed_cur = */ 
seed_cur, + /* .rng = */ std::mt19937(seed_cur), + /* .weighted_sum = */ target / (1.0f - clamped_decay), + /* .total_weight = */ 1.0f / (1.0f - clamped_decay), + /* .original_probs = */ {}, + /* .pending_token_id = */ LLAMA_TOKEN_NULL, + /* .pending_token_idx = */ -1 + } + ); +} + +// logit-bias + +struct llama_sampler_logit_bias : public llama_sampler_backend { + const int32_t n_vocab; + + const std::vector logit_bias; + + std::vector to_search; + + struct ggml_tensor * inp_logit_bias; + struct ggml_tensor * inp_logit_idxs; +}; + +static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + return ctx->get_name(); +} + +static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + + if (ctx->logit_bias.empty()) { + return; + } + + ctx->to_search.clear(); + + // update the candidates that have not been shuffled in the vocabulary (i.e. 
idx == id) + for (const auto & lb : ctx->logit_bias) { + if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) { + cur_p->data[lb.token].logit += lb.bias; + } else { + ctx->to_search.push_back(lb); + } + } + + if (ctx->to_search.empty()) { + return; + } + + // search for the remaining candidates that were not found in the previous step + for (size_t i = 0; i < cur_p->size; ++i) { + for (const auto & lb : ctx->to_search) { + if (cur_p->data[i].id == lb.token) { + cur_p->data[i].logit += lb.bias; + break; + } + } + } +} + +static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; + return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); +} + +static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) { + delete (llama_sampler_logit_bias *) smpl->ctx; +} + +static void llama_sampler_logit_bias_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + GGML_UNUSED(gf); + GGML_UNUSED(ctx); + + auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; + if (sctx->logit_bias.empty()) { + return; + } + + const size_t n = sctx->logit_bias.size(); + + sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n); + ggml_set_name(sctx->inp_logit_bias, "logit_bias"); + ggml_set_input(sctx->inp_logit_bias); + + sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n); + ggml_set_name(sctx->inp_logit_idxs, "logit_idxs"); + ggml_set_input(sctx->inp_logit_idxs); + + ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f); + + cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur)); + cur = ggml_set_rows(ctx, cur, sctx->inp_logit_bias, sctx->inp_logit_idxs); + cur = ggml_reshape_1d(ctx, cur, ggml_nelements(cur)); + + data->logits = ggml_add(ctx, data->logits, cur); +} + +static void 
llama_sampler_logit_bias_backend_set_input(struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; + if (sctx->logit_bias.empty()) { + return; + } + + GGML_ASSERT(sctx->inp_logit_bias != nullptr); + GGML_ASSERT(sctx->inp_logit_idxs != nullptr); + + const size_t n = sctx->logit_bias.size(); + + std::vector data_logit_bias(n, 0.0f); + std::vector data_logit_idxs(n, 0); + for (size_t i = 0; i < n; ++i) { + const auto & lb = sctx->logit_bias[i]; + GGML_ASSERT(lb.token >= 0 && lb.token < (int32_t) sctx->n_vocab); + data_logit_bias[i] = lb.bias; + data_logit_idxs[i] = lb.token; + } + + ggml_backend_tensor_set(sctx->inp_logit_bias, data_logit_bias.data(), 0, ggml_nbytes(sctx->inp_logit_bias)); + ggml_backend_tensor_set(sctx->inp_logit_idxs, data_logit_idxs.data(), 0, ggml_nbytes(sctx->inp_logit_idxs)); +} + +static bool llama_sampler_logit_bias_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; + + sctx->init(true); + + if (sctx->logit_bias.empty()) { + return true; + } + + return true; +} + +static struct llama_sampler_i llama_sampler_logit_bias_i = { + /* .name = */ llama_sampler_logit_bias_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_logit_bias_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_logit_bias_clone, + /* .free = */ llama_sampler_logit_bias_free, + /* .backend_init = */ llama_sampler_logit_bias_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_logit_bias_backend_apply, + /* .backend_set_input = */ llama_sampler_logit_bias_backend_set_input, +}; + +struct llama_sampler * llama_sampler_init_logit_bias( + int32_t n_vocab, + int32_t n_logit_bias, + const llama_logit_bias * logit_bias) { + const bool is_empty = n_logit_bias <= 0; + + if (is_empty) { + return llama_sampler_init_empty("?logit-bias"); + } + + return llama_sampler_init( + /* .iface = */ 
&llama_sampler_logit_bias_i, + /* .ctx = */ new llama_sampler_logit_bias { + ("logit-bias"), + /* .n_vocab = */ n_vocab, + /* .logit_bias = */ std::vector(logit_bias, logit_bias + n_logit_bias), + /* .to_search = */ {}, + /* .inp_logit_bias = */ nullptr, + /* .inp_logit_idxs = */ nullptr, + } + ); +} + +// infill + +//#define GGML_DEBUG_SAMPLER_INFILL + +struct llama_sampler_infill { + const struct llama_vocab * vocab; + + std::vector buf0; + std::vector buf1; +}; + +static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) { + return "infill"; +} + +static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_infill *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p, true); + +#if defined(GGML_DEBUG_SAMPLER_INFILL) +#define LOG_DBG_CUR LLAMA_LOG_DEBUG +#else +#define LOG_DBG_CUR(...) +#endif + + for (size_t i = 0; i < cur_p->size; ++i) { + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + float p_txt_sum = 0.0f; + float p_eog_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (ctx->vocab->is_eog(cur_p->data[i].id)) { + p_eog_sum += cur_p->data[i].p; + } else { + p_txt_sum += cur_p->data[i].p; + } + } + + const float rat = p_eog_sum == 0.0 ? 
INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); + + LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (3*p_eog_sum*cur_p->size > p_txt_sum) { + LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); + + // keep just the EOG tokens + const auto size_org = cur_p->size; + + cur_p->size = 0; + + float p_sum = 0.0f; + + for (size_t i = 0; i < size_org; ++i) { + if (ctx->vocab->is_eog(cur_p->data[i].id)) { + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + } + + return; + } + + size_t n_combined = 0; GGML_UNUSED(n_combined); + + // combine tokens with common prefix + for (size_t i0 = 0; i0 < cur_p->size; ++i0) { + for (size_t i1 = 0; i1 < cur_p->size; ++i1) { + if (cur_p->data[i0].logit == -INFINITY) { + break; + } + + if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) { + continue; + } + + int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false); + if (len0 < 0) { + ctx->buf0.resize(len0); + len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false); + assert(len0 > 0); + } + + int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false); + if (len1 < 0) { + ctx->buf1.resize(len1); + len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false); + assert(len1 > 0); + } + + // token i0 is a prefix of token i1 + if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) { + int dst = i0; + int src = i1; + + // merge into the token with higher probability + if (cur_p->data[i1].p > cur_p->data[i0].p) { + std::swap(dst, src); + } + + cur_p->data[dst].p += cur_p->data[src].p; + cur_p->data[src].logit = 
-INFINITY; + cur_p->data[src].p = 0.0f; + + n_combined++; + } + } + } + + size_t n_non_eog = 0; + + size_t size_org = cur_p->size; + + float p_sum = 0.0f; + float thold = 0.2f; + + cur_p->size = 0; + + LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + if (!is_eog) { + ++n_non_eog; + } + + p_sum += cur_p->data[i].p; + + // keep this token + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog); + + // if no non-EOG tokens are left -> reduce cur_p to single EOT token + if (n_non_eog == 0) { + cur_p->size = 1; + cur_p->data[0].id = ctx->vocab->token_eot(); + if (cur_p->data[0].id == LLAMA_TOKEN_NULL) { + cur_p->data[0].id = ctx->vocab->token_eos(); + } + cur_p->data[0].logit = 1.0f; + + GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL); + + return; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + size_org = cur_p->size; + p_sum = 0.0f; + thold = 1.0/(n_non_eog + 1); + + cur_p->size = 0; + + LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + +#undef LOG_DBG_CUR +} + +static struct llama_sampler * 
llama_sampler_infill_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_infill *) smpl->ctx; + return llama_sampler_init_infill(ctx->vocab); +} + +static void llama_sampler_infill_free(struct llama_sampler * smpl) { + delete (llama_sampler_infill *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_infill_i = { + /* .name = */ llama_sampler_infill_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_infill_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_infill_clone, + /* .free = */ llama_sampler_infill_free, + /* .backend_apply = */ nullptr, + /* .backend_accept = */ nullptr, + /* .backend_set_input = */ nullptr, + /* .backend_init = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) { + return llama_sampler_init( + /* .iface = */ &llama_sampler_infill_i, + /* .ctx = */ new llama_sampler_infill { + /* .vocab = */ vocab, + /* .buf0 = */ std::vector(512), + /* .buf1 = */ std::vector(512), + } + ); +} + +// utils + +uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { + if (smpl->iface == &llama_sampler_dist_i) { + return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_i) { + return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_v2_i) { + return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_chain_i) { + const auto * ctx = (const llama_sampler_chain *) smpl->ctx; + for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) { + const uint32_t seed = llama_sampler_get_seed(it->ptr); + if (seed != LLAMA_DEFAULT_SEED) { + return seed; + } + } + } + + return LLAMA_DEFAULT_SEED; +} + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || 
chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + auto * ctx = (struct llama_sampler_chain *) chain->ctx; + + ctx->t_sample_us = 0; + ctx->n_sample = 0; +} diff --git a/src/llama-sampler.h b/src/llama-sampler.h new file mode 100644 index 000000000..b9bfc20d2 --- /dev/null +++ b/src/llama-sampler.h @@ -0,0 +1,42 @@ +#pragma once + +#include "llama.h" + +#include + +struct llama_vocab; +struct llama_grammar; + +// sampler chain + +struct llama_sampler_chain { + llama_sampler_chain_params params; + + // has .backend_init() been called? 
+ bool is_init = false; + + struct info { + bool is_backend; + + llama_sampler * ptr; + }; + + std::vector samplers; + + // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations + std::vector cur; + + // timing + + mutable int64_t t_sample_us; + + mutable int32_t n_sample; +}; + +struct llama_sampler * llama_sampler_init_dry_testing( + int32_t context_size, + float dry_multiplier, + float dry_base, + int32_t dry_allowed_length, + int32_t dry_penalty_last_n, + const std::vector> & seq_breakers); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp deleted file mode 100644 index 515d6c163..000000000 --- a/src/llama-sampling.cpp +++ /dev/null @@ -1,3885 +0,0 @@ -#include "llama-sampling.h" - -#include "llama-impl.h" -#include "llama-vocab.h" -#include "llama-grammar.h" - -#include "ggml-cpp.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// the ring buffer works similarly to std::deque, but with a fixed capacity -template -struct ring_buffer { - ring_buffer(size_t cap) : capacity(cap), data(cap) {} - - T & front() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[first]; - } - - const T & front() const { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[first]; - } - - T & back() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[pos]; - } - - const T & back() const { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[pos]; - } - - void push_back(const T & value) { - if (capacity == 0) { - throw std::runtime_error("ring buffer: capacity is zero"); - } - - if (sz == capacity) { - // advance the start when buffer is full - first = (first + 1) % capacity; - } else { - sz++; - } - data[pos] = value; - pos = (pos + 1) % capacity; - } - - T pop_front() { - if (sz == 0) { - throw std::runtime_error("ring 
buffer is empty"); - } - T value = data[first]; - first = (first + 1) % capacity; - sz--; - return value; - } - - //T & operator[](size_t i) { - // if (i >= sz) { - // throw std::runtime_error("ring buffer: index out of bounds"); - // } - // return data[(first + i) % capacity]; - //} - - //const T & at(size_t i) const { - // if (i >= sz) { - // throw std::runtime_error("ring buffer: index out of bounds"); - // } - // return data[(first + i) % capacity]; - //} - - const T & rat(size_t i) const { - if (i >= sz) { - throw std::runtime_error("ring buffer: index out of bounds"); - } - return data[(first + sz - i - 1) % capacity]; - } - - std::vector to_vector() const { - std::vector result; - result.reserve(sz); - for (size_t i = 0; i < sz; i++) { - result.push_back(data[(first + i) % capacity]); - } - return result; - } - - void clear() { - // here only reset the status of the buffer - sz = 0; - first = 0; - pos = 0; - } - - bool empty() const { - return sz == 0; - } - - size_t size() const { - return sz; - } - - size_t capacity = 0; - size_t sz = 0; - size_t first = 0; - size_t pos = 0; - - std::vector data; -}; - -// writes result in res, does not mutate cur -static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector & res) { - static const auto comp = [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }; - - constexpr int nbuckets = 128; - constexpr float bucket_low = -10.0f; - constexpr float bucket_high = 10.0f; - constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); - constexpr float bucket_inter = -bucket_low * bucket_scale; - - std::vector bucket_idx; - std::vector histo(nbuckets, 0); - - std::vector bucket_ptrs; - - bucket_idx.reserve(cur.size); - - for (int i = 0; i < (int)cur.size; ++i) { - const float val = cur.data[i].logit; - int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); - ib = 
std::max(0, std::min(nbuckets - 1, ib)); - bucket_idx.push_back(ib); - ++histo[ib]; - } - int nhave = 0; - int ib = nbuckets - 1; - for ( ; ib >= 0; --ib) { - nhave += histo[ib]; - if (nhave >= npartial) { - break; - } - } - res.resize(nhave); - auto * ptr = res.data(); - bucket_ptrs.reserve(nbuckets - ib); - for (int j = nbuckets - 1; j >= ib; --j) { - bucket_ptrs.push_back(ptr); - ptr += histo[j]; - } - for (int i = 0; i < (int)cur.size; ++i) { - int j = bucket_idx[i]; - if (j >= ib) { - *bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i]; - } - } - - ptr = res.data(); - int ndone = 0; - for (int j = nbuckets - 1; j > ib; --j) { - std::sort(ptr, ptr + histo[j], comp); - ptr += histo[j]; - ndone += histo[j]; - } - std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp); -} - -// reduces the size of cur_p to npartial, keeping only the top npartial elements -static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) { - static const auto comp = [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }; - - if (npartial <= 128) { - std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp); - - cur_p->size = npartial; - cur_p->sorted = true; - - return; - } - - std::vector tmp; - - llama_token_data_array_partial_sort(*cur_p, npartial, tmp); - - std::copy(tmp.data(), tmp.data() + npartial, cur_p->data); - - cur_p->size = npartial; - cur_p->sorted = true; -} - -static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { - // iterator for the probabilities -#ifdef __GNUC__ - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wunused-local-typedefs" -#endif - - struct probs_iterator { - typedef std::input_iterator_tag iterator_category; - typedef float value_type; - typedef float * pointer; - typedef float & reference; - typedef ptrdiff_t difference_type; - - const llama_token_data * data; - - bool operator==(const 
probs_iterator & other) const { return data == other.data; } - bool operator!=(const probs_iterator & other) const { return data != other.data; } - const float & operator*() const { return data->p; } - probs_iterator & operator++() { ++data; return *this; } - probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; } - }; - -#ifdef __GNUC__ - #pragma GCC diagnostic pop -#endif - - std::discrete_distribution dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size}); - - return dist(rng); -} - -/* -static void llama_log_softmax(float * array, size_t size) { - float max_l = *std::max_element(array, array + size); - float sum = 0.f; - for (size_t i = 0; i < size; ++i) { - float p = expf(array[i] - max_l); - sum += p; - array[i] = p; - } - - for (size_t i = 0; i < size; ++i) { - array[i] = logf(array[i] / sum); - } -} -*/ - -static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) { - if (temp <= 0.0f) { - // find the token with the highest logit and set the rest to -inf - size_t max_i = 0; - float max_l = cur_p->data[0].logit; - - for (size_t i = 1; i < cur_p->size; ++i) { - if (cur_p->data[i ].logit > max_l) { - cur_p->data[max_i].logit = -INFINITY; - max_i = i; - max_l = cur_p->data[i].logit; - } else { - cur_p->data[i].logit = -INFINITY; - } - } - - return; - } - - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= temp; - } -} - -static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) { - GGML_ASSERT(cur_p->size > 0); - - // Sort the logits in descending order if requested - if (do_sort && !cur_p->sorted) { - llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); - } - - float max_l = cur_p->data[0].logit; - if (!cur_p->sorted) { - for (size_t i = 1; i < cur_p->size; ++i) { - max_l = std::max(max_l, cur_p->data[i].logit); - } - } - - float cum_sum = 0.0f; - - for (size_t i = 0; i < cur_p->size; ++i) { - float p = expf(cur_p->data[i].logit - 
max_l); - cur_p->data[i].p = p; - cum_sum += p; - } - - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= cum_sum; - } -} - -static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) { - // if (k >= (int32_t)cur_p->size) { - // return; - // } - - if (k <= 0) { - return; - } - - k = std::min(k, (int) cur_p->size); - - // Sort scores in descending order - if (!cur_p->sorted) { - llama_token_data_array_partial_sort_inplace(cur_p, k); - } - - cur_p->size = k; -} - -static uint32_t get_rng_seed(uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - // use system clock if std::random_device is not a true RNG - static bool is_rd_prng = std::random_device().entropy() == 0; - if (is_rd_prng) { - return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count(); - } - std::random_device rd; - return rd(); - } - return seed; -} - -// llama_sampler API - -struct llama_sampler * llama_sampler_init( - struct llama_sampler_i * iface, - llama_sampler_context_t ctx) { - return new llama_sampler { - /* .iface = */ iface, - /* .ctx = */ ctx, - }; -} - -const char * llama_sampler_name(const struct llama_sampler * smpl) { - if (!smpl->iface) { - return "(null)"; - } - - return smpl->iface->name(smpl); -} - -void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) { - if (!smpl) { - return; - } - - if (smpl->iface->accept) { - smpl->iface->accept(smpl, token); - } -} - -void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) { - if (!smpl) { - return; - } - - GGML_ASSERT(smpl->iface->apply); - smpl->iface->apply(smpl, cur_p); -} - -void llama_sampler_reset(struct llama_sampler * smpl) { - if (!smpl) { - return; - } - - if (smpl->iface->reset) { - smpl->iface->reset(smpl); - } -} - -struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { - if (!smpl) { - return nullptr; - } - - if (smpl->iface->clone) { - return smpl->iface->clone(smpl); - } - - if (smpl->ctx == 
nullptr) { - return llama_sampler_init( - /* .iface = */ smpl->iface, - /* .ctx = */ nullptr - ); - } - - GGML_ABORT("the sampler does not support cloning"); -} - -void llama_sampler_free(struct llama_sampler * smpl) { - if (smpl == nullptr) { - return; - } - - if (smpl->iface->free) { - smpl->iface->free(smpl); - } - - delete smpl; -} - -// empty sampler - -struct llama_sampler_empty { - const char * name; -}; - -static struct llama_sampler * llama_sampler_init_empty(const char * name); - -static const char * llama_sampler_empty_name(const struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_empty *) smpl->ctx; - return ctx->name; -} - -static void llama_sampler_empty_accept(struct llama_sampler * smpl, llama_token token) { - GGML_UNUSED(smpl); - GGML_UNUSED(token); -} - -static void llama_sampler_empty_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - GGML_UNUSED(smpl); - GGML_UNUSED(cur_p); -} - -static void llama_sampler_empty_reset(struct llama_sampler * smpl) { - GGML_UNUSED(smpl); -} - -static struct llama_sampler * llama_sampler_empty_clone(const struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_empty *) smpl->ctx; - return llama_sampler_init_empty(ctx->name); -} - -static void llama_sampler_empty_free(struct llama_sampler * smpl) { - delete (llama_sampler_empty *) smpl->ctx; -} - -static bool llama_sampler_empty_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - GGML_UNUSED(smpl); - GGML_UNUSED(buft); - - return true; -} - -static void llama_sampler_empty_backend_accept( - struct llama_sampler * smpl, - ggml_context * ctx, - ggml_cgraph * gf, - struct ggml_tensor * selected_token) { - GGML_UNUSED(smpl); - GGML_UNUSED(ctx); - GGML_UNUSED(gf); - GGML_UNUSED(selected_token); -} - -static void llama_sampler_empty_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - GGML_UNUSED(smpl); - 
GGML_UNUSED(ctx); - GGML_UNUSED(gf); - GGML_UNUSED(data); -} - -static void llama_sampler_empty_backend_set_input(struct llama_sampler * smpl) { - GGML_UNUSED(smpl); -} - -static struct llama_sampler_i llama_sampler_empty_i = { - /* .name = */ llama_sampler_empty_name, - /* .accept = */ llama_sampler_empty_accept, - /* .apply = */ llama_sampler_empty_apply, - /* .reset = */ llama_sampler_empty_reset, - /* .clone = */ llama_sampler_empty_clone, - /* .free = */ llama_sampler_empty_free, - /* .backend_init = */ llama_sampler_empty_backend_init, - /* .backend_accept = */ llama_sampler_empty_backend_accept, - /* .backend_apply = */ llama_sampler_empty_backend_apply, - /* .backend_set_input = */ llama_sampler_empty_backend_set_input, -}; - -struct llama_sampler * llama_sampler_init_empty(const char * name) { - return llama_sampler_init( - /* .iface = */ &llama_sampler_empty_i, - /* .ctx = */ new llama_sampler_empty { - /* .name = */ name, - } - ); -} - -// common backend sampler functionality -// -// +name : means that the sampler is support and will run on the backend -// -name : means that a ggml operator is not supported by the backend -// -struct llama_sampler_backend { - llama_sampler_backend(const char * name) : name(name), name_ext(name), is_init(false), support(false) {} - - const char * get_name() { - if (!is_init) { - return name.c_str(); - } - - if (support) { - name_ext = "+" + name; - } else { - name_ext = "-" + name; - } - - return name_ext.c_str(); - } - - void init(bool support) { - GGML_ASSERT(this->is_init == false); - - this->is_init = true; - this->support = support; - } - -private: - std::string name; - std::string name_ext; - - bool is_init; - bool support; -}; - -// check if all ggml ops used by the sampler are supported by the backend -static bool llama_sampler_backend_support( - llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * device = ggml_backend_buft_get_device(buft); - if (!device) { - // CPU backend always supported - 
return true; - } - - ggml_init_params params = { - /*.mem_size =*/ 128*ggml_tensor_overhead() + ggml_graph_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx_ptr { ggml_init(params) }; - if (!ctx_ptr) { - throw std::runtime_error(format("failed to create ggml context")); - } - - ggml_context * ctx = ctx_ptr.get(); - - const int64_t n = 1024*1024; - - llama_sampler_data data = { - /*.logits = */ ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n), - /*.probs = */ nullptr, - /*.sampled = */ nullptr, - /*.candidates = */ ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n), - }; - - ggml_cgraph * gf = ggml_new_graph(ctx); - - smpl->iface->backend_apply(smpl, ctx, gf, &data); - - if (data.logits) { - ggml_build_forward_expand(gf, data.logits); - } - - if (data.probs) { - ggml_build_forward_expand(gf, data.probs); - } - - if (data.sampled) { - ggml_build_forward_expand(gf, data.sampled); - } - - if (data.candidates) { - ggml_build_forward_expand(gf, data.candidates); - } - - for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { - struct ggml_tensor * op = ggml_graph_node(gf, i); - - if (!ggml_backend_dev_supports_op(device, op)) { - LLAMA_LOG_WARN("%s: device '%s' does not have support for op %s needed for sampler '%s'\n", - __func__, ggml_backend_dev_name(device), ggml_op_name(op->op), smpl->iface->name(smpl)); - - return false; - } - } - - return true; -} - -// sampler chain - -static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { - return "chain"; -} - -static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto & smpl : chain->samplers) { - llama_sampler_accept(smpl.ptr, token); - } - - chain->n_sample++; -} - -static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - 
time_meas tm(chain->t_sample_us, chain->params.no_perf); - - bool is_backend = chain->is_init; - - for (auto & smpl : chain->samplers) { - if (is_backend && smpl.is_backend) { - continue; - } - - is_backend = false; - - if (smpl.ptr->iface->apply == nullptr) { - continue; - } - - llama_sampler_apply(smpl.ptr, cur_p); - } -} - -static void llama_sampler_chain_reset(struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto & smpl : chain->samplers) { - llama_sampler_reset(smpl.ptr); - } -} - -static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { - const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; - - auto * result = llama_sampler_chain_init(chain_src->params); - - for (const auto & smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl.ptr)); - } - - return result; -} - -static void llama_sampler_chain_free(struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto & smpl : chain->samplers) { - llama_sampler_free(smpl.ptr); - } - - delete chain; -} - -static bool llama_sampler_chain_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - GGML_ASSERT(chain->is_init == false && "llama_sampler_chain_backend_init() called twice"); - - chain->is_init = true; - - bool res = true; - - for (auto & smpl : chain->samplers) { - bool res_cur = true; - - // to be able to run a sampler on the backend, it has to: - // - have the .backend_init() API implemented - // - return true during .backend_init() - if (smpl.ptr->iface->backend_init) { - if (!smpl.ptr->iface->backend_init(smpl.ptr, buft)) { - res_cur = false; - } - } else { - res_cur = false; - } - - smpl.is_backend = res_cur; - - res = res && res_cur; - } - - return res; -} - -static void llama_sampler_chain_backend_accept( - struct llama_sampler * smpl, - ggml_context * ctx, - 
ggml_cgraph * gf, - struct ggml_tensor * selected_token) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto & smpl : chain->samplers) { - if (!smpl.is_backend) { - break; - } - - if (smpl.ptr->iface->backend_accept) { - smpl.ptr->iface->backend_accept(smpl.ptr, ctx, gf, selected_token); - } - } -} - -static void llama_sampler_chain_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - GGML_ASSERT(chain->is_init && "llama_sampler_chain_backend_init() not called"); - - for (auto & smpl : chain->samplers) { - if (!smpl.is_backend) { - break; - } - - if (smpl.ptr->iface->backend_apply) { - smpl.ptr->iface->backend_apply(smpl.ptr, ctx, gf, data); - } - } -} - -static void llama_sampler_chain_backend_set_input(struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto & smpl : chain->samplers) { - if (!smpl.is_backend) { - break; - } - - if (smpl.ptr->iface->backend_set_input) { - smpl.ptr->iface->backend_set_input(smpl.ptr); - } - } -} - -static struct llama_sampler_i llama_sampler_chain_i = { - /* .name = */ llama_sampler_chain_name, - /* .accept = */ llama_sampler_chain_accept, - /* .apply = */ llama_sampler_chain_apply, - /* .reset = */ llama_sampler_chain_reset, - /* .clone = */ llama_sampler_chain_clone, - /* .free = */ llama_sampler_chain_free, - /* .backend_init = */ llama_sampler_chain_backend_init, - /* .backend_accept = */ llama_sampler_chain_backend_accept, - /* .backend_apply = */ llama_sampler_chain_backend_apply, - /* .backend_set_input = */ llama_sampler_chain_backend_set_input, -}; - -struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { - return llama_sampler_init( - /* .iface = */ &llama_sampler_chain_i, - /* .ctx = */ new llama_sampler_chain { - /* .params = */ params, - /* .is_init = */ false, - /* .samplers = */ {}, - 
/* .cur = */ {}, - /* .t_sample_us = */ 0, - /* .n_sample = */ 0, - } - ); -} - -llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { - const llama_token sampled_token = llama_get_sampled_token_ith (ctx, idx); - const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx); - const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx); - const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx); - - // If a backend sampler has already sampled a token, return it. - if (sampled_token != LLAMA_TOKEN_NULL) { - LLAMA_LOG_DEBUG("%s: Backend sampler selected token for idx %d. Skipping CPU samplers\n", __func__, idx); - return sampled_token; - } - - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_vocab = llama_vocab_n_tokens(vocab); - - // use pre-allocated buffer from chain if available, otherwise allocate locally - std::vector * cur_ptr; - std::vector cur_local; - - if (smpl->iface == &llama_sampler_chain_i) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - cur_ptr = &chain->cur; - } else { - cur_ptr = &cur_local; - } - - auto & cur = *cur_ptr; - - if (sampled_probs) { - const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx); - cur.resize(sampled_probs_count); - for (uint32_t i = 0; i < sampled_probs_count; ++i) { - cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]}; - } - } else if (sampled_logits) { - const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx); - cur.resize(sampled_logits_count); - for (llama_token i = 0; i < (int)sampled_logits_count; i++) { - cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f}; - } - } else { - const auto * logits = llama_get_logits_ith(ctx, idx); - GGML_ASSERT(logits != nullptr); - cur.resize(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - 
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; - } - } - - llama_token_data_array cur_p = { - /* .data = */ cur.data(), - /* .size = */ cur.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(smpl, &cur_p); - - GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); - - auto token = cur_p.data[cur_p.selected].id; - - llama_sampler_accept(smpl, token); - - return token; -} - - -void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { - auto * p = (llama_sampler_chain *) chain->ctx; - p->samplers.push_back({ - /* .is_backend = */ false, - /* .ptr = */ smpl, - }); -} - -struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i) { - if (chain == nullptr) { - return nullptr; - } - - if (chain->iface != &llama_sampler_chain_i) { - return nullptr; - } - - if (i == -1) { - return chain; - } - - const auto * p = (const llama_sampler_chain *) chain->ctx; - - if (i < 0 || (size_t) i >= p->samplers.size()) { - return nullptr; - } - - return p->samplers[i].ptr; -} - -struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) { - auto * p = (llama_sampler_chain *) chain->ctx; - - if (i < 0 || (size_t) i >= p->samplers.size()) { - return nullptr; - } - - auto * result = p->samplers[i].ptr; - p->samplers.erase(p->samplers.begin() + i); - - return result; -} - -int llama_sampler_chain_n(const struct llama_sampler * chain) { - const auto * p = (const llama_sampler_chain *) chain->ctx; - - return p->samplers.size(); -} - -// -// samplers -// - -// greedy - -struct llama_sampler_greedy : public llama_sampler_backend { -}; - -static const char * llama_sampler_greedy_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_greedy *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_greedy_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_greedy *) smpl->ctx; - 
GGML_UNUSED(ctx); -} - -static struct llama_sampler * llama_sampler_greedy_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_greedy *) smpl->ctx; - auto * result = llama_sampler_init_greedy(); - - // copy the state - { - auto * result_ctx = (llama_sampler_greedy *) result->ctx; - - GGML_UNUSED(ctx); - GGML_UNUSED(result_ctx); - } - - return result; -} - -static void llama_sampler_greedy_free(struct llama_sampler * smpl) { - delete (llama_sampler_greedy *) smpl->ctx; -} - -static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { - cur_p->selected = 0; - for (size_t i = 1; i < cur_p->size; ++i) { - if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) { - cur_p->selected = i; - } - } -} - -static bool llama_sampler_greedy_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_greedy *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_greedy_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - GGML_UNUSED(gf); - GGML_UNUSED(smpl); - - struct ggml_tensor * curl = ggml_argmax(ctx, data->logits); - ggml_set_name(curl, "greedy_argmax"); - - data->sampled = curl; -} - -static struct llama_sampler_i llama_sampler_greedy_i = { - /* .name = */ llama_sampler_greedy_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_greedy_apply, - /* .reset = */ llama_sampler_greedy_reset, - /* .clone = */ llama_sampler_greedy_clone, - /* .free = */ llama_sampler_greedy_free, - /* .backend_init = */ llama_sampler_greedy_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_greedy_backend_apply, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_greedy() { - return llama_sampler_init( 
- /* .iface = */ &llama_sampler_greedy_i, - /* .ctx = */ new llama_sampler_greedy { - ("greedy"), - } - ); -} - -// dist - -struct llama_sampler_dist : public llama_sampler_backend { - const uint32_t seed; - uint32_t seed_cur; - - std::mt19937 rng; - - ggml_tensor * inp_uniform; -}; - -static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_dist *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_dist *) smpl->ctx; - - // edge cases - if (cur_p->size == 0) { - cur_p->selected = -1; - return; - } - - cur_p->selected = 0; - - if (cur_p->size == 1) { - cur_p->data[0].p = 1.0f; - return; - } - - // max logit for numerical stability - float max_l = cur_p->data[0].logit; - if (!cur_p->sorted) { - for (size_t i = 1; i < cur_p->size; ++i) { - max_l = std::max(max_l, cur_p->data[i].logit); - } - } - - // apply softmax to obtain the probabilities - double sum_cum = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - float p = expf(cur_p->data[i].logit - max_l); - cur_p->data[i].p = p; - sum_cum += p; - } - -#if 1 - // sample from the obtained probabilities and normalize the probs in a single pass - // this is ~3x faster on Mac with full gpt-oss vocab than the version below - // - std::uniform_real_distribution dist(0.0f, 1.0f); - const double rnd = dist(ctx->rng); - - double sum_run = 0.0f; - const double sum_tgt = sum_cum*rnd; - - bool found = false; - for (size_t i = 0; i < cur_p->size; ++i) { - if (!found) { - // accumulate probs until we reach the target sum - sum_run += cur_p->data[i].p; - if (sum_run >= sum_tgt) { - cur_p->selected = i; - found = true; - } - } - - // normalize probs - cur_p->data[i].p /= sum_cum; - } - - // fallback to the last token (don't think this can happen) - assert(found); - if (!found) { - cur_p->selected = cur_p->size - 1; - } -#else - // for clarity, this is the 
same as above but does one pass for normalization and one extra pass for sampling - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= sum_cum; - } - - cur_p->selected = llama_sample_dist(cur_p, ctx->rng); -#endif -} - -static void llama_sampler_dist_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_dist *) smpl->ctx; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); -} - -static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_dist *) smpl->ctx; - auto * result = llama_sampler_init_dist(ctx->seed); - - // copy the state - { - auto * result_ctx = (llama_sampler_dist *) result->ctx; - - result_ctx->rng = ctx->rng; - } - - return result; -} - -static void llama_sampler_dist_free(struct llama_sampler * smpl) { - delete (llama_sampler_dist *) smpl->ctx; -} - -static bool llama_sampler_dist_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_dist *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_dist_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - GGML_UNUSED(gf); - - auto * sctx = (llama_sampler_dist *) smpl->ctx; - - sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - ggml_set_name (sctx->inp_uniform, "uniform"); - ggml_set_input(sctx->inp_uniform); - - struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); - ggml_set_name(probs, "dist_probs"); - - struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs); - ggml_set_name(cumsum, "dist_cumsum"); - - // The uniform tensor has a random value and we subtract this tensor with - // the cumsum tensor (the uniform tensor will be broadcasted by ggml_sub). 
- // Recall that each entry in cumsum is the cumulative probability up to that - // index so values stay negative while the cumulative total is below the - // random value, and become zero/positive once the threshold is crossed. - struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform); - ggml_set_name(diff, "dist_cumsum"); - - // The ggml_step function produces a tensor where entries are 1 if the - // corresponding entry in diff is > 0, and 0 otherwise. So all values up to - // the index where the cumulative probability exceeds the random value are 0, - // and all entries after that are 1. - struct ggml_tensor * mask = ggml_step(ctx, diff); - ggml_set_name(mask, "dist_mask"); - - // Taking the sum of the mask gives us the sum of elements after the threshold - // we are interested in. - struct ggml_tensor * idxf = ggml_sum(ctx, mask); - ggml_set_name(idxf, "dist_index_f32"); - - // Use ggml_scale_bias to scale the index value by -1 and then add the size - // of the mask to that value so we get the correct index ((-1 * idxf) + n). - struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32); - ggml_set_name(idx, "dist_index_i32"); - - // Map back to original vocab ids if a candidates tensor is available. 
- struct ggml_tensor * sampled_token = idx; - if (data->candidates != nullptr) { - struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates)); - - sampled_token = ggml_get_rows(ctx, candidates, idx); - ggml_set_name(sampled_token, "dist_sampled_token"); - } - - data->sampled = sampled_token; - data->probs = probs; -} - -static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_dist *) smpl->ctx; - - GGML_ASSERT(sctx->inp_uniform != nullptr); - - // We sample in double precision and cast to float to match rnd numbers of - // llama_dampler_dist which uses double precision (sampling from - // std::uniform_real_distribution and - // std::uniform_real_distribution with same rng will produce - // different sequences). - std::uniform_real_distribution dist(0.0f, 1.0f); - const float rnd = dist(sctx->rng); - - ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); -} - -static struct llama_sampler_i llama_sampler_dist_i = { - /* .name = */ llama_sampler_dist_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_dist_apply, - /* .reset = */ llama_sampler_dist_reset, - /* .clone = */ llama_sampler_dist_clone, - /* .free = */ llama_sampler_dist_free, - /* .backend_init = */ llama_sampler_dist_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_dist_backend_apply, - /* .backend_set_input = */ llama_sampler_dist_backend_set_input, -}; - -struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { - auto seed_cur = get_rng_seed(seed); - return llama_sampler_init( - /* .iface = */ &llama_sampler_dist_i, - /* .ctx = */ new llama_sampler_dist { - ("dist"), - /* .seed = */ seed, - /* .seed_cur = */ seed_cur, - /* .rng = */ std::mt19937(seed_cur), - /* .inp_uniform = */ nullptr, - } - ); -} - -// top-k - -struct llama_sampler_top_k : public llama_sampler_backend { - const int32_t k; -}; - -static const char * 
llama_sampler_top_k_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_top_k *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_top_k *) smpl->ctx; - llama_sampler_top_k_impl(cur_p, ctx->k); -} - -static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_top_k *) smpl->ctx; - return llama_sampler_init_top_k(ctx->k); -} - -static void llama_sampler_top_k_free(struct llama_sampler * smpl) { - delete (llama_sampler_top_k *) smpl->ctx; -} - -static bool llama_sampler_top_k_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_top_k *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_top_k_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - auto * sctx = (llama_sampler_top_k *) smpl->ctx; - - struct ggml_tensor * top_k = ggml_top_k(ctx, data->logits, sctx->k); - ggml_set_name(top_k, "top_k"); - - if (data->candidates) { - struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]); - data->candidates = ggml_get_rows(ctx, candidates_rows, top_k); - data->candidates = ggml_reshape_1d(ctx, data->candidates, sctx->k); - ggml_set_name(data->candidates, "top_k_candidates"); - } else { - data->candidates = top_k; - } - - struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]); - struct ggml_tensor * top_k_rows = ggml_get_rows(ctx, logits_rows, top_k); - data->logits = ggml_reshape_1d(ctx, top_k_rows, sctx->k); - ggml_set_name(top_k_rows, "top_k_rows"); - - GGML_UNUSED(gf); -} - -static struct llama_sampler_i llama_sampler_top_k_i = { 
- /* .name = */ llama_sampler_top_k_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_top_k_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_top_k_clone, - /* .free = */ llama_sampler_top_k_free, - /* .backend_init = */ llama_sampler_top_k_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_top_k_backend_apply, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_top_k(int32_t k) { - const bool is_empty = (k <= 0); - - if (is_empty) { - return llama_sampler_init_empty("?top-k"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_top_k_i, - /* .ctx = */ new llama_sampler_top_k { - ("top-k"), - /* .k = */ k, - } - ); -} - -// top-p - -struct llama_sampler_top_p : public llama_sampler_backend { - const float p; - const size_t min_keep; - - std::vector buf_sort; -}; - -static const char * llama_sampler_top_p_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_top_p *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_top_p *) smpl->ctx; - - if (ctx->p >= 1.0f) { - return; - } - - llama_sampler_softmax_impl(cur_p, false); - - size_t k = cur_p->size; - auto * pdata = cur_p->data; - - auto & buf_sort = ctx->buf_sort; - - // if not sorted, try adaptive top-k sorting - if (!cur_p->sorted && cur_p->size > 1024) { - k = std::min(256, cur_p->size); - llama_token_data_array_partial_sort(*cur_p, k, buf_sort); - pdata = buf_sort.data(); - } else if (!cur_p->sorted) { - // small candidates -> sort inplace - llama_token_data_array_partial_sort_inplace(cur_p, k); - } - - // Compute the cumulative probabilities - float cum_sum = 0.0f; - size_t last_idx = cur_p->size; - - for (size_t i = 0; i < cur_p->size; ++i) { - cum_sum += pdata[i].p; - - // Check if the running sum is at least p or if we have kept at least min_keep tokens - 
// we set the last index to i+1 to indicate that the current iterate should be included in the set - if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) { - last_idx = i + 1; - break; - } - - // we exceeded the current top-k heuristic -> increase k and continue - if (!cur_p->sorted && i == k - 1) { - k = cur_p->size; - llama_token_data_array_partial_sort(*cur_p, k, buf_sort); - pdata = buf_sort.data(); - } - } - - // Resize the output vector to keep only the top-p tokens - if (!cur_p->sorted) { - std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data); - cur_p->sorted = true; - } - - cur_p->size = last_idx; -} - -static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_top_p *) smpl->ctx; - return llama_sampler_init_top_p(ctx->p, ctx->min_keep); -} - -static void llama_sampler_top_p_free(struct llama_sampler * smpl) { - delete (llama_sampler_top_p *) smpl->ctx; -} - -static bool llama_sampler_top_p_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_top_p *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_top_p_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - auto * sctx = (llama_sampler_top_p *) smpl->ctx; - - auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(ggml_nrows(a) == 1); - struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]); - struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b); - return ggml_reshape_1d(ctx, a_sorted, a->ne[0]); - }; - - // Get the sorted logits in descending order. 
- struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC); - ggml_set_name(sorted_idx, "top_p_sorted_idx"); - - // Do the sorting via reshape + get_rows - struct ggml_tensor * sorted_logits = ggml_sort(data->logits, sorted_idx); - ggml_set_name(sorted_logits, "top_p_sorted_logits"); - - struct ggml_tensor * softmax = ggml_soft_max(ctx, sorted_logits); - ggml_set_name(softmax, "top_p_softmax"); - - // If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates. - if (data->candidates) { - data->candidates = ggml_sort(data->candidates, sorted_idx); - } else { - data->candidates = sorted_idx; - } - ggml_set_name(data->candidates, "top_p_candidates"); - - // Compute Cumulative Distribution Function (CDF) by means of GGML_OP_CUMSUM. - struct ggml_tensor * cdf = ggml_cumsum(ctx, softmax); - ggml_set_name(cdf, "top_p_cdf"); - - // Invert CDF and add top-p value so that ggml_step yields 1 for values we want to keep - struct ggml_tensor * cdf_scaled = ggml_scale_bias(ctx, cdf, -1.0f, sctx->p); - ggml_set_name(cdf_scaled, "top_p_cdf_scaled"); - - struct ggml_tensor * mask = ggml_step(ctx, cdf_scaled); - ggml_set_name(mask, "top_p_mask"); - - // Taking the sum of the mask gives us the sum of elements after the threshold - // we are interested in. - struct ggml_tensor * idxf = ggml_sum(ctx, mask); - ggml_set_name(idxf, "top_p_index_f32"); - - // prevent out-of-bounds access - idxf = ggml_clamp(ctx, idxf, 0.0f, mask->ne[0] - 1); - - // construct ones tensor to set the value in the mask - struct ggml_tensor * ones = ggml_scale_bias(ctx, idxf, 0.0f, 1.0f); - ggml_set_name(ones, "top_p_ones"); - - // Make top-p inclusive (i.e. 
return all values such that cum_sum/cdf >= p) - struct ggml_tensor * mask_reshaped = ggml_reshape_2d(ctx, mask, 1, mask->ne[0]); - - mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32)); - mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]); - - // Apply -INFINITY bias for masked-out tokens - // log(1) = 0 (keep), log(0) = -INF (discard) - struct ggml_tensor * top_p_bias = ggml_log(ctx, mask); - ggml_set_name(top_p_bias, "top_p_bias"); - - data->logits = ggml_add(ctx, sorted_logits, top_p_bias); - ggml_set_name(data->logits, "top_p_logits"); - - GGML_UNUSED(gf); -} - -static struct llama_sampler_i llama_sampler_top_p_i = { - /* .name = */ llama_sampler_top_p_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_top_p_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_top_p_clone, - /* .free = */ llama_sampler_top_p_free, - /* .backend_init = */ llama_sampler_top_p_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_top_p_backend_apply, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) { - const bool is_empty = p >= 1.0f; - - if (is_empty) { - return llama_sampler_init_empty("?top-p"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_top_p_i, - /* .ctx = */ new llama_sampler_top_p { - ("top-p"), - /* .p = */ p, - /* .min_keep = */ min_keep, - /* .buf_sort = */ {}, - } - ); -} - -// min-p - -struct llama_sampler_min_p : public llama_sampler_backend { - const float p; - const size_t min_keep; -}; - -static const char * llama_sampler_min_p_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_min_p *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_min_p *) smpl->ctx; - - if (ctx->p <= 0.0f || !cur_p->size) { - return; - } - - bool 
min_p_applied = false; - - // if the cur_p aren't sorted, try the unsorted implementation first - if (!cur_p->sorted) { - std::vector filtered_tokens; - - float max_logit = -FLT_MAX; - for (size_t i = 0; i < cur_p->size; ++i) { - max_logit = std::max(max_logit, cur_p->data[i].logit); - } - const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max - - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].logit >= min_logit) { - filtered_tokens.push_back(cur_p->data[i]); - } - } - - // if we have enough values the operation was a success - if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) { - std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data); - cur_p->size = filtered_tokens.size(); - min_p_applied = true; - } - } - - // if the cur_p are sorted or the unsorted implementation failed, use this implementation - if (!min_p_applied) { - // Sort the logits in descending order - if (!cur_p->sorted) { - llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); - } - - const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max - size_t i = 1; // first token always matches - - for (; i < cur_p->size; ++i) { - if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) { - break; // prob too small - } - } - - // Resize the output vector to keep only the matching tokens - cur_p->size = i; - } -} - -static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_min_p *) smpl->ctx; - return llama_sampler_init_min_p(ctx->p, ctx->min_keep); -} - -static void llama_sampler_min_p_free(struct llama_sampler * smpl) { - delete (llama_sampler_min_p *) smpl->ctx; -} - -static bool llama_sampler_min_p_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_min_p *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - 
sctx->init(res); - - return res; -} - -static void llama_sampler_min_p_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - auto * sctx = (llama_sampler_min_p *) smpl->ctx; - - struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits); - ggml_set_name(max_idx, "max_idx"); - - struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]); - ggml_set_name(logits_rows, "logits_rows"); - - struct ggml_tensor * max_logit = ggml_get_rows(ctx, logits_rows, max_idx); - ggml_set_name(max_logit, "max_logit"); - - // Calculate the threshold value. - struct ggml_tensor * threshold = ggml_scale_bias(ctx, max_logit, 1.0f, logf(sctx->p)); - ggml_set_name(threshold, "min_p_threshold"); - - // Subtract the threshold from logits. - struct ggml_tensor * sub = ggml_sub(ctx, data->logits, threshold); - - // Create a mask where logits below the threshold are 0 (discard), - // and others are 1 (keep). 
- struct ggml_tensor * mask = ggml_step(ctx, sub); - ggml_set_name(mask, "min_p_mask"); - - // Apply -INFINITY bias for masked-out tokens - // log(1) = 0 (keep), log(0) = -INF (discard) - struct ggml_tensor * min_p_bias = ggml_log(ctx, mask); - ggml_set_name(min_p_bias, "min_p_bias"); - - data->logits = ggml_add(ctx, data->logits, min_p_bias); - ggml_set_name(data->logits, "min_p_logits"); - - GGML_UNUSED(gf); -} - -static struct llama_sampler_i llama_sampler_min_p_i = { - /* .name = */ llama_sampler_min_p_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_min_p_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_min_p_clone, - /* .free = */ llama_sampler_min_p_free, - /* .backend_init = */ llama_sampler_min_p_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_min_p_backend_apply, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) { - const bool is_empty = (p <= 0.0f); - - if (is_empty) { - return llama_sampler_init_empty("?min-p"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_min_p_i, - /* .ctx = */ new llama_sampler_min_p { - ("min-p"), - /* .p = */ p, - /* .min_keep = */ min_keep, - } - ); -} - -// typical - -struct llama_sampler_typical { - const float p; - const size_t min_keep; -}; - -static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) { - return "typical"; -} - -static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_typical *) smpl->ctx; - - // Reference implementation: - // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr - if (ctx->p >= 1.0f) { - return; - } - - // Compute the softmax of logits and calculate entropy - llama_sampler_softmax_impl(cur_p, true); - - float entropy = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - entropy += 
-cur_p->data[i].p * logf(cur_p->data[i].p); - } - - // Compute the absolute difference between negative log probability and entropy for each candidate - std::vector shifted_scores; - for (size_t i = 0; i < cur_p->size; ++i) { - float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy); - shifted_scores.push_back(shifted_score); - } - - // Sort tokens based on the shifted_scores and their corresponding indices - std::vector indices(cur_p->size); - std::iota(indices.begin(), indices.end(), 0); - - std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { - return shifted_scores[a] < shifted_scores[b]; - }); - - // Compute the cumulative probabilities - float cum_sum = 0.0f; - size_t last_idx = indices.size(); - - for (size_t i = 0; i < indices.size(); ++i) { - size_t idx = indices[i]; - cum_sum += cur_p->data[idx].p; - - // Check if the running sum is greater than typical or if we have kept at least min_keep tokens - if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) { - last_idx = i + 1; - break; - } - } - - // Resize the output vector to keep only the locally typical tokens - std::vector cur_p_new; - for (size_t i = 0; i < last_idx; ++i) { - size_t idx = indices[i]; - cur_p_new.push_back(cur_p->data[idx]); - } - - // Replace the data in cur_p with the cur_p_new data - std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data); - cur_p->size = cur_p_new.size(); - cur_p->sorted = false; -} - -static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_typical *) smpl->ctx; - return llama_sampler_init_typical(ctx->p, ctx->min_keep); -} - -static void llama_sampler_typical_free(struct llama_sampler * smpl) { - delete (llama_sampler_typical *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_typical_i = { - /* .name = */ llama_sampler_typical_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_typical_apply, - /* .reset = */ nullptr, - 
/* .clone = */ llama_sampler_typical_clone, - /* .free = */ llama_sampler_typical_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) { - const bool is_empty = (p >= 1.0f); - - if (is_empty) { - return llama_sampler_init_empty("?typical"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_typical_i, - /* .ctx = */ new llama_sampler_typical { - /* .p = */ p, - /* .min_keep = */ min_keep, - } - ); -} - -// temp - -struct llama_sampler_temp : public llama_sampler_backend { - const float temp; -}; - -static const char * llama_sampler_temp_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_temp *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_temp *) smpl->ctx; - - llama_sampler_temp_impl(cur_p, ctx->temp); -} - -static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_temp *) smpl->ctx; - return llama_sampler_init_temp(ctx->temp); -} - -static void llama_sampler_temp_free(struct llama_sampler * smpl) { - delete (llama_sampler_temp *) smpl->ctx; -} - -static void llama_sampler_backend_temp_sampling( - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data, - float temp) { - if (temp <= 0.0f) { - // Find the most probable token index. 
- struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits); - ggml_set_name(max_idx, "temp_max_idx"); - - if (data->candidates) { - struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]); - data->candidates = ggml_get_rows(ctx, candidates_rows, max_idx); - } else { - data->candidates = max_idx; - } - - struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]); - data->logits = ggml_get_rows(ctx, logits_rows, max_idx); - - return; - } - - data->logits = ggml_scale(ctx, data->logits, 1.0f / temp); - - GGML_UNUSED(gf); -} - -static bool llama_sampler_temp_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_temp *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_temp_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - auto * sctx = (llama_sampler_temp *) smpl->ctx; - llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp); -} - -static struct llama_sampler_i llama_sampler_temp_i = { - /* .name = */ llama_sampler_temp_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_temp_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_temp_clone, - /* .free = */ llama_sampler_temp_free, - /* .backend_init = */ llama_sampler_temp_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_temp_backend_apply, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_temp(float temp) { - const bool is_empty = temp == 1.0f; - - if (is_empty) { - return llama_sampler_init_empty("?temp"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_temp_i, - /* .ctx = */ new llama_sampler_temp { - ("temp"), - /*.temp = */ temp, - } - ); -} - -// temp-ext - 
-struct llama_sampler_temp_ext : public llama_sampler_backend { - const float temp; - const float delta; - const float exponent; -}; - -static const char * llama_sampler_temp_ext_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_temp_ext *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_temp_ext *) smpl->ctx; - if (ctx->delta > 0) { - const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); - const float max_temp = ctx->temp + ctx->delta; - - float exponent_val = ctx->exponent; - - // no need to do anything if there is only one (or zero) candidates - if (cur_p->size <= 1) { - return; - } - - // Calculate maximum possible entropy - float max_entropy = -logf(1.0f / cur_p->size); - - llama_sampler_softmax_impl(cur_p, true); - - // Calculate entropy of the softmax probabilities - float entropy = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - float prob = cur_p->data[i].p; - if (prob > 0.0f) { // Ensure no log(0) - entropy -= prob * logf(prob); - } - } - - // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above) - float normalized_entropy = entropy / max_entropy; - - // Map the normalized entropy to the desired temperature range using the power function - float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); - - #ifdef DEBUG - LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); - LLAMA_LOG_INFO("Entropy: %f\n", entropy); - LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); - LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); - LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); - LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); - #endif - - // Apply the dynamically calculated temperature scaling - llama_sampler_temp_impl(cur_p, dyn_temp); - - // Re-compute softmax probabilities after 
scaling logits with dynamic temperature - const double max_l_double = cur_p->data[0].logit; - - double cum_sum_double = 0.0; - for (size_t i = 0; i < cur_p->size; ++i) { - double p = exp(cur_p->data[i].logit - max_l_double); - cur_p->data[i].p = p; // Store the scaled probability - cum_sum_double += p; - } - - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities - } - - #ifdef DEBUG - // Print the updated top 25 probabilities after temperature scaling - LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); - for (size_t i = 0; i < 25 && i < cur_p->size; ++i) { - LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f); - } - #endif - } else { - llama_sampler_temp_impl(cur_p, ctx->temp); - } -} - -static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx; - return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent); -} - -static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) { - delete (llama_sampler_temp_ext *) smpl->ctx; -} - -static bool llama_sampler_temp_ext_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_temp_ext *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_temp_ext_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - auto * sctx = (llama_sampler_temp_ext *) smpl->ctx; - - // Revert to standard temperature scaling if delta or temp are non-positive. - if (sctx->delta <= 0.0f || sctx->temp <= 0.0f) { - llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp); - return; - } - - // Calculate min_temp, max_temp, and max_entropy. 
- const float min_temp = std::max(0.0f, sctx->temp - sctx->delta); - const float max_temp = sctx->temp + sctx->delta; - const float max_entropy = logf(data->logits->ne[0]); - - // Calculate the probabilities. - struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); - ggml_set_name(probs, "temp_ext_softmax_probs"); - - // Clamp probabilities to avoid log(0) which would give -inf - struct ggml_tensor * probs_clamped = ggml_clamp(ctx, probs, 1e-10f, 1.0f); - ggml_set_name(probs_clamped, "temp_ext_probs_clamped"); - - // Calculate the entropy, entropy = -Σ(p * log(p)). - struct ggml_tensor * log_probs = ggml_log(ctx, probs_clamped); - struct ggml_tensor * p_log_p = ggml_mul(ctx, probs_clamped, log_probs); - struct ggml_tensor * sum_p_log_p = ggml_sum(ctx, p_log_p); - struct ggml_tensor * entropy = ggml_scale(ctx, sum_p_log_p, -1.0f); - ggml_set_name(log_probs, "temp_ext_log_probs"); - ggml_set_name(p_log_p, "temp_ext_p_log_p"); - ggml_set_name(sum_p_log_p, "temp_ext_sum_p_log_p"); - ggml_set_name(entropy, "temp_ext_entropy"); - - // Normalize the entropy, norm_entropy = entropy / max_entropy - struct ggml_tensor * norm_entropy = ggml_scale(ctx, entropy, 1.0f / max_entropy); - ggml_set_name(norm_entropy, "temp_ext_norm_entropy"); - - // Calculate the dynamic temperature: - // dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent); - // - // Calculate powf(normalized_entropy, exponent) as - // norm_entropy^exponent = exp(exponent * log(norm_entropy)) - struct ggml_tensor * log_norm_entropy = ggml_log(ctx, norm_entropy); - struct ggml_tensor * scaled_log = ggml_scale(ctx, log_norm_entropy, sctx->exponent); - struct ggml_tensor * pow_entropy = ggml_exp(ctx, scaled_log); - // With pow_entropy computed we can now compute dyn_temp, scaling by - // (max_temp - min_temp) and then adding min_temp. 
- struct ggml_tensor * dyn_temp = ggml_scale_bias(ctx, pow_entropy, max_temp - min_temp, min_temp); - ggml_set_name(log_norm_entropy, "temp_ext_log_norm_entropy"); - ggml_set_name(scaled_log, "temp_ext_scaled_log"); - ggml_set_name(pow_entropy, "temp_ext_pow_entropy"); - ggml_set_name(dyn_temp, "temp_ext_dyn_temp"); - - // Scale the logits by the dynamic temperature - struct ggml_tensor * scaled_logits = ggml_div(ctx, data->logits, dyn_temp); - ggml_set_name(scaled_logits, "temp_ext_scaled_logits"); - - data->logits = scaled_logits; -} - -static struct llama_sampler_i llama_sampler_temp_ext_i = { - /* .name = */ llama_sampler_temp_ext_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_temp_ext_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_temp_ext_clone, - /* .free = */ llama_sampler_temp_ext_free, - /* .backend_init = */ llama_sampler_temp_ext_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_temp_ext_backend_apply, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) { - const bool is_empty = temp == 1.0f && delta <= 0.0f; - - if (is_empty) { - return llama_sampler_init_empty("?temp-ext"); - } - - auto * res = llama_sampler_init( - /* .iface = */ &llama_sampler_temp_ext_i, - /* .ctx = */ new llama_sampler_temp_ext { - ("temp-ext"), - /* .temp = */ temp, - /* .delta = */ delta, - /* .exponent = */ exponent, - } - ); - - return res; -} - -// xtc - -struct llama_sampler_xtc { - const float probability; - const float threshold; - const size_t min_keep; - - const uint32_t seed; - uint32_t seed_cur; - - std::mt19937 rng; -}; - -static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { - return "xtc"; -} - -static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_xtc *) smpl->ctx; - - if (ctx->probability <= 0.0f - || 
ctx->threshold > 0.5f - || cur_p->size < 2) { - return; - } - - std::uniform_real_distribution distribution(0.0f, 1.0f); - float chance = distribution(ctx->rng); - if (chance > ctx->probability) { - return; - } - - llama_sampler_softmax_impl(cur_p, true); - - int pos_last = 0; - - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].p >= ctx->threshold) { - pos_last = i; - } else { - break; - } - } - - if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { - cur_p->data += pos_last; - cur_p->size -= pos_last; - } -} - -static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; - auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); - - // copy the state - { - auto * result_ctx = (llama_sampler_xtc *) result->ctx; - - result_ctx->rng = ctx->rng; - } - - return result; -} - -static void llama_sampler_xtc_free(struct llama_sampler * smpl) { - delete (llama_sampler_xtc *) smpl->ctx; -} - -static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_xtc *) smpl->ctx; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); -} - -static struct llama_sampler_i llama_sampler_xtc_i = { - /* .name = */ llama_sampler_xtc_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sample_xtc_apply, - /* .reset = */ llama_sampler_xtc_reset, - /* .clone = */ llama_sampler_xtc_clone, - /* .free = */ llama_sampler_xtc_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { - const bool is_empty = (p <= 0.0f || t > 0.5f); - - if (is_empty) { - return llama_sampler_init_empty("?xtc"); - } - - const auto seed_cur = get_rng_seed(seed); - - return llama_sampler_init( - /* .iface = */ 
&llama_sampler_xtc_i, - /* .ctx = */ new llama_sampler_xtc { - /* .probability = */ p, - /* .threshold = */ t, - /* .min_keep = */ min_keep, - /* .seed = */ seed, - /* .seed_cur = */ seed_cur, - /* .rng = */ std::mt19937(seed_cur), - } - ); -} - -// mirostat - -struct llama_sampler_mirostat { - const int32_t n_vocab; - - const uint32_t seed; - uint32_t seed_cur; - - const float tau; - const float eta; - - const int32_t m; - - float mu; - - std::mt19937 rng; -}; - -static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { - return "mirostat"; -} - -static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_mirostat *) smpl->ctx; - - llama_sampler_softmax_impl(cur_p, true); - - // Estimate s_hat using the most probable m tokens - float s_hat = 0.0; - float sum_ti_bi = 0.0; - float sum_ti_sq = 0.0; - for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) { - float t_i = logf(float(i + 2) / float(i + 1)); - float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p); - sum_ti_bi += t_i * b_i; - sum_ti_sq += t_i * t_i; - } - s_hat = sum_ti_bi / sum_ti_sq; - - // Compute k from the estimated s_hat and target surprise value - float epsilon_hat = s_hat - 1; - float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat); - - llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); - - llama_sampler_softmax_impl(cur_p, true); - - const int idx = llama_sample_dist(cur_p, ctx->rng); - - cur_p->selected = idx; - - float observed_surprise = -log2f(cur_p->data[idx].p); - float e = observed_surprise - ctx->tau; - - // Update mu using the learning rate and error - ctx->mu = ctx->mu - ctx->eta * e; -} - -static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx; - auto * result = llama_sampler_init_mirostat(ctx->n_vocab, 
ctx->seed, ctx->tau, ctx->eta, ctx->m); - - // copy the state - { - auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx; - - result_ctx->mu = ctx->mu; - result_ctx->rng = ctx->rng; - } - - return result; -} - -static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_mirostat *) smpl->ctx; - ctx->mu = 2.0f*ctx->tau; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); -} - -static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { - delete (llama_sampler_mirostat *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_mirostat_i = { - /* .name = */ llama_sampler_mirostat_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_mirostat_apply, - /* .reset = */ llama_sampler_mirostat_reset, - /* .clone = */ llama_sampler_mirostat_clone, - /* .free = */ llama_sampler_mirostat_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { - const auto seed_cur = get_rng_seed(seed); - - return llama_sampler_init( - /* .iface = */ &llama_sampler_mirostat_i, - /* .ctx = */ new llama_sampler_mirostat { - /* .n_vocab = */ n_vocab, - /* .seed = */ seed, - /* .seed_cur = */ seed_cur, - /* .tau = */ tau, - /* .eta = */ eta, - /* .m = */ m, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed_cur), - } - ); -} - -// mirostat v2 - -struct llama_sampler_mirostat_v2 { - const uint32_t seed; - uint32_t seed_cur; - - const float tau; - const float eta; - - float mu; - - std::mt19937 rng; -}; - -static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { - return "mirostat-v2"; -} - -static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; - - 
llama_sampler_softmax_impl(cur_p, true); - - // Truncate the words with surprise values greater than mu - cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) { - return -log2f(candidate.p) > ctx->mu; - })); - - if (cur_p->size == 0) { - cur_p->size = 1; - } - - // Normalize the probabilities of the remaining words - llama_sampler_softmax_impl(cur_p, true); - - const int idx = llama_sample_dist(cur_p, ctx->rng); - - cur_p->selected = idx; - - float observed_surprise = -log2f(cur_p->data[idx].p); - float e = observed_surprise - ctx->tau; - - // Update mu using the learning rate and error - ctx->mu = ctx->mu - ctx->eta * e; -} - -static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; - ctx->mu = 2.0f*ctx->tau; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); -} - -static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx; - - auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta); - - // copy the state - { - auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx; - - result_ctx->mu = ctx->mu; - result_ctx->rng = ctx->rng; - } - - return result; -} - -static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) { - delete (llama_sampler_mirostat_v2 *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_mirostat_v2_i = { - /* .name = */ llama_sampler_mirostat_v2_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_mirostat_v2_apply, - /* .reset = */ llama_sampler_mirostat_v2_reset, - /* .clone = */ llama_sampler_mirostat_v2_clone, - /* .free = */ llama_sampler_mirostat_v2_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - 
-struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { - auto seed_cur = get_rng_seed(seed); - return llama_sampler_init( - /* .iface = */ &llama_sampler_mirostat_v2_i, - /* .ctx = */ new llama_sampler_mirostat_v2 { - /* .seed = */ seed, - /* .seed_cur = */ seed_cur, - /* .tau = */ tau, - /* .eta = */ eta, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed_cur), - } - ); -} - -// grammar - -struct llama_sampler_grammar { - const struct llama_vocab * vocab; - - std::string grammar_str; - std::string grammar_root; - - struct llama_grammar * grammar; -}; - -static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) { - return "grammar"; -} - -static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) { - auto * ctx = (llama_sampler_grammar *) smpl->ctx; - if (ctx->grammar) { - llama_grammar_accept_impl(*ctx->grammar, token); - } -} - -static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_grammar *) smpl->ctx; - if (ctx->grammar) { - llama_grammar_apply_impl(*ctx->grammar, cur_p); - } -} - -// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle. 
-static struct llama_sampler * llama_sampler_init_grammar_impl( - const struct llama_vocab * vocab, - const char * grammar_str, - const char * grammar_root, - bool lazy, - const char ** trigger_words, - size_t num_trigger_words, - const llama_token * trigger_tokens, - size_t num_trigger_tokens, - const char ** trigger_patterns, - size_t num_trigger_patterns); - -static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_grammar *) smpl->ctx; - if (!ctx->grammar) { - return; - } - - std::vector trigger_patterns_c; - trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size()); - for (auto & trigger_pattern : ctx->grammar->trigger_patterns) { - trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); - } - - auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(), - ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), - ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); - - llama_grammar_free_impl(ctx->grammar); - ctx->grammar = grammar_new; -} - -static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; - - auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0); - GGML_ASSERT(result); - - // copy the state - { - auto * result_ctx = (llama_sampler_grammar *) result->ctx; - - if (ctx->grammar) { - result_ctx->grammar_str = ctx->grammar_str; - result_ctx->grammar_root = ctx->grammar_root; - - result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar); - } - } - - return result; -} - -static void llama_sampler_grammar_free(struct llama_sampler * smpl) { - const auto * ctx = (llama_sampler_grammar *) smpl->ctx; - - if (ctx->grammar) { - llama_grammar_free_impl(ctx->grammar); - } - - delete ctx; -} - -static struct llama_sampler_i 
llama_sampler_grammar_i = { - /* .name = */ llama_sampler_grammar_name, - /* .accept = */ llama_sampler_grammar_accept_impl, - /* .apply = */ llama_sampler_grammar_apply, - /* .reset = */ llama_sampler_grammar_reset, - /* .clone = */ llama_sampler_grammar_clone, - /* .free = */ llama_sampler_grammar_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -static struct llama_sampler * llama_sampler_init_grammar_impl( - const struct llama_vocab * vocab, - const char * grammar_str, - const char * grammar_root, - bool lazy, - const char ** trigger_words, - size_t num_trigger_words, - const llama_token * trigger_tokens, - size_t num_trigger_tokens, - const char ** trigger_patterns, - size_t num_trigger_patterns) { - auto * ctx = new llama_sampler_grammar; - - if (grammar_str != nullptr && grammar_str[0] != '\0') { - std::string trigger_pattern; - llama_grammar * grammar = nullptr; - // TODO: remove trigger_words support. 
- if (trigger_words != nullptr && num_trigger_words > 0) { - GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0); - trigger_pattern = "[\\s\\S]*?("; - for (size_t i = 0; i < num_trigger_words; ++i) { - static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); - if (i > 0) { - trigger_pattern += "|"; - } - trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0"); - } - trigger_pattern += ")[\\s\\S]*"; - - std::array tmp_trigger_patterns = { trigger_pattern.c_str() }; - grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens); - } else { - grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens); - } - *ctx = { - /* .vocab = */ vocab, - /* .grammar_str = */ grammar_str, - /* .grammar_root = */ grammar_root, - /* .grammar = */ grammar, - }; - if (!ctx->grammar) { - delete ctx; - return nullptr; - } - } else { - *ctx = { - /* .vocab = */ vocab, - /* .grammar_str = */ {}, - /* .grammar_root = */ {}, - /* .grammar = */ nullptr, - }; - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_grammar_i, - /* .ctx = */ ctx - ); -} - -struct llama_sampler * llama_sampler_init_grammar( - const struct llama_vocab * vocab, - const char * grammar_str, - const char * grammar_root) { - return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0); -} - -struct llama_sampler * llama_sampler_init_grammar_lazy( - const struct llama_vocab * vocab, - const char * grammar_str, - const char * grammar_root, - const char ** trigger_words, - size_t num_trigger_words, - const llama_token * trigger_tokens, - size_t num_trigger_tokens) { - return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, 
trigger_tokens, num_trigger_tokens, nullptr, 0); -} - -struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( - const struct llama_vocab * vocab, - const char * grammar_str, - const char * grammar_root, - const char ** trigger_patterns, - size_t num_trigger_patterns, - const llama_token * trigger_tokens, - size_t num_trigger_tokens) { - return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns); -} - -// penalties - -struct llama_sampler_penalties { - const int32_t penalty_last_n; - const float penalty_repeat; - const float penalty_freq; - const float penalty_present; - - ring_buffer prev; - - // a frequency map to count token occurrences - std::unordered_map token_count; -}; - -static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) { - return "penalties"; -} - -static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) { - auto * ctx = (llama_sampler_penalties *) smpl->ctx; - if (ctx->penalty_last_n == 0) { - return; - } - - ctx->token_count[token]++; - - // if the ring buffer is full, remove the oldest token - if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) { - const auto old = ctx->prev.front(); - - ctx->token_count[old]--; - if (ctx->token_count[old] == 0) { - ctx->token_count.erase(old); - } - } - - ctx->prev.push_back(token); - -#if 0 - // sanity check - std::unordered_map tmp; - for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { - tmp[ctx->prev.rat(i)]++; - } - - assert(ctx->token_count == tmp); -#endif -} - -static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_penalties *) smpl->ctx; - - if ((ctx->penalty_last_n == 0) || - (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) { - return; - } - - // Apply frequency and 
presence penalties to the cur_p - for (size_t i = 0; i < cur_p->size; ++i) { - const auto token_iter = ctx->token_count.find(cur_p->data[i].id); - if (token_iter == ctx->token_count.end()) { - continue; - } - - const int count = token_iter->second; - - assert(count > 0 && count <= ctx->penalty_last_n); - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (cur_p->data[i].logit <= 0) { - cur_p->data[i].logit *= ctx->penalty_repeat; - } else { - cur_p->data[i].logit /= ctx->penalty_repeat; - } - - cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present; - } - - cur_p->sorted = false; -} - -static void llama_sampler_penalties_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_penalties *) smpl->ctx; - ctx->prev.clear(); - ctx->token_count.clear(); -} - -static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_penalties *) smpl->ctx; - auto * result = llama_sampler_init_penalties( - ctx->penalty_last_n, - ctx->penalty_repeat, - ctx->penalty_freq, - ctx->penalty_present); - - // copy the state - { - auto * result_ctx = (llama_sampler_penalties *) result->ctx; - - result_ctx->prev = ctx->prev; - } - - return result; -} - -static void llama_sampler_penalties_free(struct llama_sampler * smpl) { - delete (llama_sampler_penalties *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_penalties_i = { - /* .name = */ llama_sampler_penalties_name, - /* .accept = */ llama_sampler_penalties_accept, - /* .apply = */ llama_sampler_penalties_apply, - /* .reset = */ llama_sampler_penalties_reset, - /* .clone = */ llama_sampler_penalties_clone, - /* .free = */ llama_sampler_penalties_free, - /* 
.backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_penalties( - int32_t penalty_last_n, - float penalty_repeat, - float penalty_freq, - float penalty_present) { - penalty_last_n = std::max(penalty_last_n, 0); - - const bool is_empty = (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)); - - if (is_empty) { - return llama_sampler_init_empty("?penalties"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_penalties_i, - /* .ctx = */ new llama_sampler_penalties { - /* .penalty_last_n = */ penalty_last_n, - /* .penalty_repeat = */ penalty_repeat, - /* .penalty_freq = */ penalty_freq, - /* .penalty_present = */ penalty_present, - /* .prev = */ ring_buffer(penalty_last_n), - /* .token_count = */ {}, - } - ); -} - -// top-n-sigma - -struct llama_sampler_top_n_sigma { - const float n; -}; - -static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) { - return "top-n-sigma"; -} - -static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx; - - if (ctx->n <= 0.0f || cur_p->size <= 1) { - return; - } - - // find max logit and calculate mean - float max = cur_p->data[0].logit; - float logits_sum = 0; - size_t valid_count = 0; - for (size_t i = 0; i < cur_p->size; ++i) { - // Only count non-negative infinity values - if (cur_p->data[i].logit != -INFINITY) { - max = std::max(max, cur_p->data[i].logit); - logits_sum += cur_p->data[i].logit; - valid_count++; - } - } - float mean = valid_count > 0 ? 
logits_sum/valid_count : 0; - - // calculate standard deviation - float acc = 0; - for (size_t i = 0; i < cur_p->size; ++i) { - // Skip -infinity in std calculation - if (cur_p->data[i].logit != -INFINITY) { - acc += pow(cur_p->data[i].logit - mean, 2); - } - } - float std = valid_count > 0 ? sqrt(acc/valid_count) : 0; - - // apply mask - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].logit < max - (ctx->n * std)) { - cur_p->data[i].logit = -INFINITY; - } - } - - llama_sampler_softmax_impl(cur_p, true); -} - -static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx; - return llama_sampler_init_top_n_sigma(ctx->n); -} - -static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) { - delete (llama_sampler_top_n_sigma *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_top_n_sigma_i = { - /* .name = */ llama_sampler_top_n_sigma_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_top_n_sigma_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_top_n_sigma_clone, - /* .free = */ llama_sampler_top_n_sigma_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_top_n_sigma(float n) { - const bool is_empty = (n <= 0.0f); - - if (is_empty) { - return llama_sampler_init_empty("?top-n-sigma"); - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_top_n_sigma_i, - /* .ctx = */ new llama_sampler_top_n_sigma { - /* .n = */ n, - } - ); -} - -// DRY - -struct llama_sampler_dry { - int32_t total_context_size; - - const float dry_multiplier; - const float dry_base; - const int32_t dry_allowed_length; - const int32_t dry_penalty_last_n; - - std::unordered_multimap> dry_processed_breakers; - std::vector dry_repeat_count; - std::unordered_map dry_max_token_repeat; - 
ring_buffer last_tokens; -}; - -// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) -static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap>& token_sequences, int max_tail_len = -1) { - for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) { - std::string word = vocab.detokenize({token_id}, true); - if (word.find(str) != std::string::npos) { - token_sequences.emplace(token_id, std::vector()); - } else { - size_t word_len = word.size(); - size_t str_len = str.size(); - size_t pos = -1; - while ((pos = word.find(str[0], pos + 1)) != std::string::npos) { - bool match = true; - size_t i; - for (i = 1; i < str_len && i + pos < word_len; ++i) { - if (word[pos + i] != str[i]) { - match = false; - break; - } - } - if (match) { - std::vector tokenization = vocab.tokenize(str.substr(i), false, false); - if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) { - tokenization.resize(max_tail_len); - } - - // Ensure we don't already have a duplicate matching tokenization - auto its = token_sequences.equal_range(token_id); - bool found = false; - for (auto it = its.first; it != its.second; ++it) { - if (tokenization == it->second) { - found = true; - break; - } - } - if (!found) { - token_sequences.emplace(token_id, tokenization); - } - } - } - } - } -} - -static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) { - return "dry"; -} - -static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) { - auto * ctx = (llama_sampler_dry *) smpl->ctx; - if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) { - return; - } - - ctx->last_tokens.push_back(token); -} - -// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) -static void llama_sampler_dry_apply(struct llama_sampler * 
smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_dry *) smpl->ctx; - - if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) { - return; - } - - int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0); - int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size); - - if (last_n_repeat <= ctx->dry_allowed_length) { - return; - } - - ctx->dry_repeat_count.assign(last_n_repeat, 0); - ctx->dry_max_token_repeat.clear(); - - // Step 1: Look for restart sequences to limit the maximum repetition length. - // Work backwards through the context looking for any token that begins a restart sequence. - // - // The collection `restart_sequences` is a mapping from a "head" token to all "tail" - // sequences that together comprise a restart sequence. This allows us to quickly check - // whether each token is the head of a complete sequence. Most restart sequences are actually - // a single token, and for these the "tail" is an empty vector. - // - // If the token is a "head", test all restart sequences that begin with this token - // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and - // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The - // longest matching sequence (if any) is used to limit the maximum repetition length. - // - // Note that in the case case of a short sequence contained in a longer one, this might fail to - // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as - // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress - // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare. 
- // - // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we - // have already clamped the maximum tail sequence length when generating `restart_sequences`. - // With clamping, this scan is O(N) in the context length. - - int rep_limit = last_n_repeat; - for (int i = 0; i < last_n_repeat; ++i) { - llama_token token = ctx->last_tokens.rat(i); - auto its = ctx->dry_processed_breakers.equal_range(token); - if (its.first == ctx->dry_processed_breakers.end()) { - continue; - } - int longest_match = -1; - for (auto it = its.first; it != its.second; ++it) { - // Note that (*it) does not contain the head character, so seq_len will be - // the restart sequence length minus 1. - // In the common case of a single-token restart sequence, (*it) will be empty - // and we will trivially match. - int seq_len = (int)it->second.size(); - if (seq_len > longest_match && seq_len <= (int)i) { - bool match = true; - for (int offset = 0; offset < seq_len; ++offset) { - // The -1 when indexing `last_tokens` is because we already matched the head. - if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) { - match = false; - break; - } - } - if (match) { - longest_match = seq_len; - } - } - } - if (longest_match >= 0) { - // We found a restart sequence starting `i` tokens from the end and continuing for - // `longest_match` tokens. - rep_limit = i - longest_match; - break; - } - } - if (rep_limit < ctx->dry_allowed_length) { - return; - } - - // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in - // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing - // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences. 
- // - // This algorithm is not currently documented on Wikipedia, but there is a clear description here: - // https://ivanyu.me/blog/2014/10/15/z-algorithm/ - // - // The code below is adapted from the public domain implementation by the same author here: - // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py - // - // Example: - // Last N tokens: a b c c b c y a b c - // Repeat counts: 0 0 3 1 0 2 0 0 0 0 - // ^ - // This `3` means that the last three tokens of the context (a b c) also appear here. - // - // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested - // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each - // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables - // ensure that the inner while loops only examine each token in the context once as the outer - // for loop iterates over the context. - - { - const int last = last_n_repeat - 1; - - int rt = 0; - int lt = 0; - - for (int k = 1; k < last_n_repeat; ++k) { - if (k > rt) { - // If k is outside the current Z-box, do naive computation. - int n = 0; - while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) { - ++n; - } - ctx->dry_repeat_count[last - k] = std::min(n, rep_limit); - if (n > 0) { - lt = k; - rt = k + n - 1; - } - } else { - // If k is inside the current Z-box, consider two cases. - - int p = k - lt; // Pair index. 
- int right_part_len = rt - k + 1; - - if (ctx->dry_repeat_count[last - p] < right_part_len) { - int n = std::min(ctx->dry_repeat_count[last - p], rep_limit); - ctx->dry_repeat_count[last - k] = n; - } else { - int i = rt + 1; - while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) { - i += 1; - } - - int n = std::min(i - k, rep_limit); - ctx->dry_repeat_count[last - k] = n; - lt = k; - rt = i - 1; - } - } - } - } - - // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length - // that would be generated by emitting each new token that would extend a sequence. - // - // Following the same example as above: - // Last N tokens: a b c c b c y a b c - // Repeat counts: 0 0 3 1 0 2 0 0 0 0 - // - // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition. - // c: 3 -> 4 (from `a b c` to `a b c c`) - // b: 1 -> 2 (from `c` to `c b`) - // y: 2 -> 3 (from `b c` to `b c y`) - - for (int i = 0; i < last_n_repeat - 1; ++i) { - int repeat_len = ctx->dry_repeat_count[i]; - if (repeat_len >= ctx->dry_allowed_length) { - // This token ends a repeat, so the next token would continue one. - // By convention, the value of `repeat_len` only includes the tokens currently - // in the context, not the new token that would be added. - llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i); - // Track the maximum sequence ending in this token. - const auto& it = ctx->dry_max_token_repeat.find(token); - if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) { - ctx->dry_max_token_repeat[token] = repeat_len; - } - } - } - - // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens. - - // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`. 
- // Compute it from `penalty_base` and the approximate log of `std::numeric_limits::max()` - const float FLOAT_MAX_LOG = 88.7228391f; - int max_exponent = 0; - if (ctx->dry_base > 1.000001f) { - max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base); - } - - for (size_t i = 0; i < cur_p->size; ++i) { - const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id); - if (af_kvp != ctx->dry_max_token_repeat.end()) { - // Check all sequence breakers starting with this token - auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id); - bool is_single_token_breaker = false; - - for (auto it = range.first; it != range.second; ++it) { - if (it->second.empty()) { - is_single_token_breaker = true; - break; - } - } - - // Apply penalty only if it's not a single-token sequence breaker - if (!is_single_token_breaker) { - int repeat_exp = af_kvp->second - ctx->dry_allowed_length; - if (max_exponent > 0 && repeat_exp > max_exponent) { - repeat_exp = max_exponent; - } - float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp); - cur_p->data[i].logit -= penalty; - } - } - } - - cur_p->sorted = false; -} - -static void llama_sampler_dry_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_dry *) smpl->ctx; - ctx->last_tokens.clear(); - ctx->dry_repeat_count.clear(); - ctx->dry_max_token_repeat.clear(); -} - -static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) { - const auto * ctx = (llama_sampler_dry *) smpl->ctx; - - llama_vocab dummy_vocab; - - // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying - auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0); - - // Copy the state, including the processed breakers - { - auto * result_ctx = (llama_sampler_dry *) result->ctx; - 
result_ctx->dry_processed_breakers = ctx->dry_processed_breakers; - result_ctx->dry_repeat_count = ctx->dry_repeat_count; - result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat; - result_ctx->last_tokens = ctx->last_tokens; - } - - return result; -} - -static void llama_sampler_dry_free(struct llama_sampler * smpl) { - delete (llama_sampler_dry *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_dry_i = { - /* .name = */ llama_sampler_dry_name, - /* .accept = */ llama_sampler_dry_accept, - /* .apply = */ llama_sampler_dry_apply, - /* .reset = */ llama_sampler_dry_reset, - /* .clone = */ llama_sampler_dry_clone, - /* .free = */ llama_sampler_dry_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) { - int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? 
n_ctx_train : std::max(dry_penalty_last_n, 0); - std::unordered_multimap> processed_breakers; - const int MAX_CHAR_LEN = 40; - const int MAX_SEQ_LEN = 20; - - const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0); - - if (!dry_enabled) { - return llama_sampler_init_empty("?dry"); - } - - if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) { - // Process sequence breakers - for (size_t i = 0; i < num_breakers; ++i) { - if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) { - LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i); - continue; - } - - std::string sequence_break(seq_breakers[i]); - if (sequence_break.empty()) { - LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n"); - continue; - } - - if (sequence_break.size() > MAX_CHAR_LEN) { - LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN); - sequence_break.resize(MAX_CHAR_LEN); - } - - get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN); - } - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_dry_i, - /* .ctx = */ new llama_sampler_dry { - /* .total_context_size = */ n_ctx_train, - /* .dry_multiplier = */ dry_multiplier, - /* .dry_base = */ dry_base, - /* .dry_allowed_length = */ dry_allowed_length, - /* .dry_penalty_last_n = */ dry_penalty_last_n, - /* .dry_processed_breakers = */ std::move(processed_breakers), - /* .dry_repeat_count = */ dry_enabled ? std::vector(effective_dry_penalty_last_n, 0) : std::vector{}, - /* .dry_max_token_repeat = */ {}, - /* .last_tokens = */ dry_enabled ? 
ring_buffer(effective_dry_penalty_last_n) : ring_buffer(0), - } - ); -} - -// wrapper for test-sampling.cpp -struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector>& seq_breakers) { - llama_vocab dummy_vocab; - auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0); - auto * ctx = (llama_sampler_dry *) result->ctx; - - // Process the token-based sequence breakers - ctx->dry_processed_breakers.clear(); - if (seq_breakers.empty()) { - LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n"); - } else { - for (const auto& breaker : seq_breakers) { - if (breaker.empty()) { - LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n"); - continue; - } - llama_token head_token = breaker[0]; - std::vector tail_tokens(breaker.begin() + 1, breaker.end()); - ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens)); - } - - if (ctx->dry_processed_breakers.empty()) { - LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n"); - } - } - - return result; -} - -// adaptive-p sampler state -// -// maintains an exponential moving average of the *ORIGINAL* probabilities -// of selected tokens, used to compute an adapted target at each sampling step. 
-// -// see llama.h for a full description of the sampler -// -// ref: https://github.com/ggml-org/llama.cpp/pull/17927 -// -struct llama_sampler_adaptive_p { - const float target; // target probability (0.0 - 1.0; negative = disabled) - const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) - const uint32_t seed; // original RNG seed - uint32_t seed_cur; // actual RNG seed - std::mt19937 rng; // RNG state - float weighted_sum; // sum(p_i * decay^i) - float total_weight; // sum(decay^i), converges to 1/(1-decay) - std::vector original_probs; // pre-transform probs, cached for EMA update - llama_token pending_token_id; // token ID of selected token - int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs -}; - -// adaptive probability transformation constants -static constexpr float DISTRIBUTION_WIDTH = 0.3f; -static constexpr float PEAK_LOGIT_VALUE = 5.0f; -static constexpr float SHARPNESS = 10.0f; -static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; - -static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) { - return "adaptive-p"; -} - -static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; - - llama_sampler_softmax_impl(cur_p, false); - - if (ctx->target < 0.0f) { - // at negative target values, adaptive-p is no-op - // we simply sample from the existing distribution - cur_p->selected = llama_sample_dist(cur_p, ctx->rng); - return; - } - - // store the original probabilities - ctx->original_probs.resize(cur_p->size); - for (size_t i = 0; i < cur_p->size; ++i) { - ctx->original_probs[i] = cur_p->data[i].p; - } - - // using the EMA, compute the adapted target probability for the current sampling step - auto target = std::clamp(ctx->target, 0.0f, 1.0f); - float adapted_target = std::clamp( - ctx->total_weight == 0.0f ? 
target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight), - 0.0f, 1.0f - ); - - // adaptive probability transform - // - // quadratic near target for fine differentiation, transitioning to linear decay in the - // tails. unbounded negative logits ensure proper suppression of far-from-target tokens - // after the softmax. - // - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].logit == -INFINITY) { - // don't transform logits that are -INFINITY - // (as masked out by e.g. min-p and top-p when using backend sampling) - continue; - } - float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH); - cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist); - } - - // softmax and sample from the transformed distribution - llama_sampler_softmax_impl(cur_p, false); - const int idx = llama_sample_dist(cur_p, ctx->rng); - cur_p->selected = idx; - - // store the selected token ID for acceptance later - ctx->pending_token_id = cur_p->data[idx].id; - ctx->pending_token_idx = idx; -} - -static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) { - auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; - if (ctx->pending_token_id == token) { - GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL); - GGML_ASSERT(ctx->pending_token_idx != -1); - // update EMA with the original probability of the selected token - ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum; - ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; - } - ctx->pending_token_id = LLAMA_TOKEN_NULL; - ctx->pending_token_idx = -1; -} - -static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; - // ctx->target and ctx->decay never change after init, so it's safe to keep them as is. - // original_probs is completely overwritten on every call to _apply. 
- // so we only need to reset the EMA state and pending token. - ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); - ctx->total_weight = 1.0f / (1.0f - ctx->decay); - ctx->pending_token_id = LLAMA_TOKEN_NULL; - ctx->pending_token_idx = -1; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); -} - -static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx; - auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed); - auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx; - - // copy everything (target, decay, seed, and RNG are already set) - result_ctx->weighted_sum = ctx->weighted_sum; - result_ctx->total_weight = ctx->total_weight; - result_ctx->pending_token_id = ctx->pending_token_id; - result_ctx->pending_token_idx = ctx->pending_token_idx; - - return result; -} - -static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) { - delete (llama_sampler_adaptive_p *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_adaptive_p_i = { - /* .name = */ llama_sampler_adaptive_p_name, - /* .accept = */ llama_sampler_adaptive_p_accept, - /* .apply = */ llama_sampler_adaptive_p_apply, - /* .reset = */ llama_sampler_adaptive_p_reset, - /* .clone = */ llama_sampler_adaptive_p_clone, - /* .free = */ llama_sampler_adaptive_p_free, - /* .backend_init = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ nullptr, - /* .backend_set_input = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_adaptive_p( - float target, - float decay, - uint32_t seed -) { - auto seed_cur = get_rng_seed(seed); - float clamped_decay = std::clamp(decay, 0.0f, 0.99f); - return llama_sampler_init( - /* .iface = */ &llama_sampler_adaptive_p_i, - /* .ctx = */ new llama_sampler_adaptive_p { - /* .target = */ target, - /* .decay = */ clamped_decay, - /* .seed = */ seed, - /* .seed_cur = */ 
seed_cur, - /* .rng = */ std::mt19937(seed_cur), - /* .weighted_sum = */ target / (1.0f - clamped_decay), - /* .total_weight = */ 1.0f / (1.0f - clamped_decay), - /* .original_probs = */ {}, - /* .pending_token_id = */ LLAMA_TOKEN_NULL, - /* .pending_token_idx = */ -1 - } - ); -} - -// logit-bias - -struct llama_sampler_logit_bias : public llama_sampler_backend { - const int32_t n_vocab; - - const std::vector logit_bias; - - std::vector to_search; - - struct ggml_tensor * inp_logit_bias; - struct ggml_tensor * inp_logit_idxs; -}; - -static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; - return ctx->get_name(); -} - -static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; - - if (ctx->logit_bias.empty()) { - return; - } - - ctx->to_search.clear(); - - // update the candidates that have not been shuffled in the vocabulary (i.e. 
idx == id) - for (const auto & lb : ctx->logit_bias) { - if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) { - cur_p->data[lb.token].logit += lb.bias; - } else { - ctx->to_search.push_back(lb); - } - } - - if (ctx->to_search.empty()) { - return; - } - - // search for the remaining candidates that were not found in the previous step - for (size_t i = 0; i < cur_p->size; ++i) { - for (const auto & lb : ctx->to_search) { - if (cur_p->data[i].id == lb.token) { - cur_p->data[i].logit += lb.bias; - break; - } - } - } -} - -static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; - return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); -} - -static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) { - delete (llama_sampler_logit_bias *) smpl->ctx; -} - -static void llama_sampler_logit_bias_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - GGML_UNUSED(gf); - GGML_UNUSED(ctx); - - auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; - if (sctx->logit_bias.empty()) { - return; - } - - const size_t n = sctx->logit_bias.size(); - - sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n); - ggml_set_name(sctx->inp_logit_bias, "logit_bias"); - ggml_set_input(sctx->inp_logit_bias); - - sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n); - ggml_set_name(sctx->inp_logit_idxs, "logit_idxs"); - ggml_set_input(sctx->inp_logit_idxs); - - ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f); - - cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur)); - cur = ggml_set_rows(ctx, cur, sctx->inp_logit_bias, sctx->inp_logit_idxs); - cur = ggml_reshape_1d(ctx, cur, ggml_nelements(cur)); - - data->logits = ggml_add(ctx, data->logits, cur); -} - -static void 
llama_sampler_logit_bias_backend_set_input(struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; - if (sctx->logit_bias.empty()) { - return; - } - - GGML_ASSERT(sctx->inp_logit_bias != nullptr); - GGML_ASSERT(sctx->inp_logit_idxs != nullptr); - - const size_t n = sctx->logit_bias.size(); - - std::vector data_logit_bias(n, 0.0f); - std::vector data_logit_idxs(n, 0); - for (size_t i = 0; i < n; ++i) { - const auto & lb = sctx->logit_bias[i]; - GGML_ASSERT(lb.token >= 0 && lb.token < (int32_t) sctx->n_vocab); - data_logit_bias[i] = lb.bias; - data_logit_idxs[i] = lb.token; - } - - ggml_backend_tensor_set(sctx->inp_logit_bias, data_logit_bias.data(), 0, ggml_nbytes(sctx->inp_logit_bias)); - ggml_backend_tensor_set(sctx->inp_logit_idxs, data_logit_idxs.data(), 0, ggml_nbytes(sctx->inp_logit_idxs)); -} - -static bool llama_sampler_logit_bias_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - - auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; - - sctx->init(true); - - if (sctx->logit_bias.empty()) { - return true; - } - - return true; -} - -static struct llama_sampler_i llama_sampler_logit_bias_i = { - /* .name = */ llama_sampler_logit_bias_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_logit_bias_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_logit_bias_clone, - /* .free = */ llama_sampler_logit_bias_free, - /* .backend_init = */ llama_sampler_logit_bias_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_logit_bias_backend_apply, - /* .backend_set_input = */ llama_sampler_logit_bias_backend_set_input, -}; - -struct llama_sampler * llama_sampler_init_logit_bias( - int32_t n_vocab, - int32_t n_logit_bias, - const llama_logit_bias * logit_bias) { - const bool is_empty = n_logit_bias <= 0; - - if (is_empty) { - return llama_sampler_init_empty("?logit-bias"); - } - - return llama_sampler_init( - /* .iface = */ 
&llama_sampler_logit_bias_i, - /* .ctx = */ new llama_sampler_logit_bias { - ("logit-bias"), - /* .n_vocab = */ n_vocab, - /* .logit_bias = */ std::vector(logit_bias, logit_bias + n_logit_bias), - /* .to_search = */ {}, - /* .inp_logit_bias = */ nullptr, - /* .inp_logit_idxs = */ nullptr, - } - ); -} - -// infill - -//#define GGML_DEBUG_SAMPLER_INFILL - -struct llama_sampler_infill { - const struct llama_vocab * vocab; - - std::vector buf0; - std::vector buf1; -}; - -static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) { - return "infill"; -} - -static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_infill *) smpl->ctx; - - llama_sampler_softmax_impl(cur_p, true); - -#if defined(GGML_DEBUG_SAMPLER_INFILL) -#define LOG_DBG_CUR LLAMA_LOG_DEBUG -#else -#define LOG_DBG_CUR(...) -#endif - - for (size_t i = 0; i < cur_p->size; ++i) { - LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); - } - - float p_txt_sum = 0.0f; - float p_eog_sum = 0.0f; - - for (size_t i = 0; i < cur_p->size; ++i) { - if (ctx->vocab->is_eog(cur_p->data[i].id)) { - p_eog_sum += cur_p->data[i].p; - } else { - p_txt_sum += cur_p->data[i].p; - } - } - - const float rat = p_eog_sum == 0.0 ? 
INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); - - LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); - - if (3*p_eog_sum*cur_p->size > p_txt_sum) { - LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); - - // keep just the EOG tokens - const auto size_org = cur_p->size; - - cur_p->size = 0; - - float p_sum = 0.0f; - - for (size_t i = 0; i < size_org; ++i) { - if (ctx->vocab->is_eog(cur_p->data[i].id)) { - p_sum += cur_p->data[i].p; - - cur_p->data[cur_p->size++] = cur_p->data[i]; - } - } - - // normalize probs - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= p_sum; - } - - return; - } - - size_t n_combined = 0; GGML_UNUSED(n_combined); - - // combine tokens with common prefix - for (size_t i0 = 0; i0 < cur_p->size; ++i0) { - for (size_t i1 = 0; i1 < cur_p->size; ++i1) { - if (cur_p->data[i0].logit == -INFINITY) { - break; - } - - if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) { - continue; - } - - int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false); - if (len0 < 0) { - ctx->buf0.resize(len0); - len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false); - assert(len0 > 0); - } - - int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false); - if (len1 < 0) { - ctx->buf1.resize(len1); - len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false); - assert(len1 > 0); - } - - // token i0 is a prefix of token i1 - if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) { - int dst = i0; - int src = i1; - - // merge into the token with higher probability - if (cur_p->data[i1].p > cur_p->data[i0].p) { - std::swap(dst, src); - } - - cur_p->data[dst].p += cur_p->data[src].p; - cur_p->data[src].logit = 
-INFINITY; - cur_p->data[src].p = 0.0f; - - n_combined++; - } - } - } - - size_t n_non_eog = 0; - - size_t size_org = cur_p->size; - - float p_sum = 0.0f; - float thold = 0.2f; - - cur_p->size = 0; - - LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold); - - for (size_t i = 0; i < size_org; ++i) { - const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id); - - if (cur_p->data[i].p < thold && !is_eog) { - continue; - } - - if (!is_eog) { - ++n_non_eog; - } - - p_sum += cur_p->data[i].p; - - // keep this token - cur_p->data[cur_p->size++] = cur_p->data[i]; - } - - LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog); - - // if no non-EOG tokens are left -> reduce cur_p to single EOT token - if (n_non_eog == 0) { - cur_p->size = 1; - cur_p->data[0].id = ctx->vocab->token_eot(); - if (cur_p->data[0].id == LLAMA_TOKEN_NULL) { - cur_p->data[0].id = ctx->vocab->token_eos(); - } - cur_p->data[0].logit = 1.0f; - - GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL); - - return; - } - - // normalize probs - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= p_sum; - - LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); - } - - size_org = cur_p->size; - p_sum = 0.0f; - thold = 1.0/(n_non_eog + 1); - - cur_p->size = 0; - - LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold); - - for (size_t i = 0; i < size_org; ++i) { - const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id); - - if (cur_p->data[i].p < thold && !is_eog) { - continue; - } - - p_sum += cur_p->data[i].p; - - cur_p->data[cur_p->size++] = cur_p->data[i]; - } - - // normalize probs - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= p_sum; - - LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); - } - -#undef LOG_DBG_CUR -} - -static struct llama_sampler * 
llama_sampler_infill_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_infill *) smpl->ctx; - return llama_sampler_init_infill(ctx->vocab); -} - -static void llama_sampler_infill_free(struct llama_sampler * smpl) { - delete (llama_sampler_infill *) smpl->ctx; -} - -static struct llama_sampler_i llama_sampler_infill_i = { - /* .name = */ llama_sampler_infill_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_infill_apply, - /* .reset = */ nullptr, - /* .clone = */ llama_sampler_infill_clone, - /* .free = */ llama_sampler_infill_free, - /* .backend_apply = */ nullptr, - /* .backend_accept = */ nullptr, - /* .backend_set_input = */ nullptr, - /* .backend_init = */ nullptr, -}; - -struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) { - return llama_sampler_init( - /* .iface = */ &llama_sampler_infill_i, - /* .ctx = */ new llama_sampler_infill { - /* .vocab = */ vocab, - /* .buf0 = */ std::vector(512), - /* .buf1 = */ std::vector(512), - } - ); -} - -// utils - -uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { - if (smpl->iface == &llama_sampler_dist_i) { - return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; - } - - if (smpl->iface == &llama_sampler_mirostat_i) { - return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; - } - - if (smpl->iface == &llama_sampler_mirostat_v2_i) { - return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur; - } - - if (smpl->iface == &llama_sampler_chain_i) { - const auto * ctx = (const llama_sampler_chain *) smpl->ctx; - for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) { - const uint32_t seed = llama_sampler_get_seed(it->ptr); - if (seed != LLAMA_DEFAULT_SEED) { - return seed; - } - } - } - - return LLAMA_DEFAULT_SEED; -} - -// perf - -struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { - struct llama_perf_sampler_data data = {}; - - if (chain == nullptr || 
chain->iface != &llama_sampler_chain_i) { - GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); - } - - const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; - - data.t_sample_ms = 1e-3 * ctx->t_sample_us; - data.n_sample = std::max(0, ctx->n_sample); - - return data; -} - -void llama_perf_sampler_print(const struct llama_sampler * chain) { - const auto data = llama_perf_sampler(chain); - - LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample); -} - -void llama_perf_sampler_reset(struct llama_sampler * chain) { - if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { - GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); - } - - auto * ctx = (struct llama_sampler_chain *) chain->ctx; - - ctx->t_sample_us = 0; - ctx->n_sample = 0; -} diff --git a/src/llama-sampling.h b/src/llama-sampling.h deleted file mode 100644 index 6a963c0bb..000000000 --- a/src/llama-sampling.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ? - -#include "llama.h" - -#include - -struct llama_vocab; -struct llama_grammar; - -// sampler chain - -struct llama_sampler_chain { - llama_sampler_chain_params params; - - // has .backend_init() been called? - bool is_init = false; - - struct info { - bool is_backend; - - llama_sampler * ptr; - }; - - std::vector samplers; - - // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations - std::vector cur; - - // timing - - mutable int64_t t_sample_us; - - mutable int32_t n_sample; -}; - -struct llama_sampler * llama_sampler_init_dry_testing( - int32_t context_size, - float dry_multiplier, - float dry_base, - int32_t dry_allowed_length, - int32_t dry_penalty_last_n, - const std::vector> & seq_breakers);