From: Georgi Gerganov Date: Wed, 3 Sep 2025 15:16:26 +0000 (+0300) Subject: sampling : optimize dist sampler (#15704) X-Git-Tag: upstream/0.0.6527~159 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=cdedb70a998cea7052560fe0b0615a839443564d;p=pkg%2Fggml%2Fsources%2Fllama.cpp sampling : optimize dist sampler (#15704) ggml-ci --- diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e8c0fc34..2186f827 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -604,10 +604,73 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl* static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - // sorting is not necessary here - llama_sampler_softmax_impl(cur_p, false); + // edge cases + if (cur_p->size == 0) { + cur_p->selected = -1; + return; + } + + cur_p->selected = 0; + + if (cur_p->size == 1) { + cur_p->data[0].p = 1.0f; + return; + } + + // max logit for numerical stability + float max_l = cur_p->data[0].logit; + if (!cur_p->sorted) { + for (size_t i = 1; i < cur_p->size; ++i) { + max_l = std::max(max_l, cur_p->data[i].logit); + } + } + + // apply softmax to obtain the probabilities + double sum_cum = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; + sum_cum += p; + } + +#if 1 + // sample from the obtained probabilities and normalize the probs in a single pass + // this is ~3x faster on Mac with full gpt-oss vocab than the version below + // + std::uniform_real_distribution dist(0.0f, 1.0f); + const double rnd = dist(ctx->rng); + + double sum_run = 0.0f; + const double sum_tgt = sum_cum*rnd; + + bool found = false; + for (size_t i = 0; i < cur_p->size; ++i) { + if (!found) { + // accumulate probs until we reach the target sum + sum_run += cur_p->data[i].p; + if (sum_run >= sum_tgt) { + cur_p->selected = i; + found = true; + } + } + + // normalize probs + cur_p->data[i].p /= sum_cum; + } + + // fallback to the last token (don't think this can happen) + assert(found); + if (!found) { + cur_p->selected = cur_p->size - 1; + } +#else + // for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= sum_cum; + } cur_p->selected = llama_sample_dist(cur_p, ctx->rng); +#endif } static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {