GGML_ASSERT(false && "unknown mirostat version");
}
} else {
- llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+ if (params.n_probs > 0) {
+ // some use cases require sampling greedily, while still obtaining the probabilities of the top tokens
+ // ref: https://github.com/ggerganov/llama.cpp/pull/9605
+ //
+ // the following will not produce exactly the same probs as applying softmax to the full vocabulary, but
+ // it is much faster, since we avoid sorting all tokens, and it should give a good approximation
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+ }
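+ // the greedy sampler below still picks the single most likely token; the samplers above only populate the candidate probs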
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
}
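To make the approximation above concrete, here is a small standalone sketch (plain C++, hypothetical logits, not part of the patch): normalizing over only the top-k candidates drops the probability mass of the discarded tail from the denominator, so the retained probabilities come out slightly inflated, but stay close to the full softmax when the distribution is peaked.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // one strong candidate, one decent one, and a long low-probability tail
        std::vector<float> logits = { 5.0f, 4.0f };
        for (int i = 0; i < 1000; i++) {
            logits.push_back(-4.0f);
        }

        // probability of the best token when normalizing over the first k candidates
        auto p_best = [&](size_t k) {
            double denom = 0.0;
            for (size_t i = 0; i < k; i++) {
                denom += std::exp(logits[i]);
            }
            return std::exp(logits[0]) / denom;
        };

        printf("full softmax : %.4f\n", p_best(logits.size())); // ~0.67
        printf("top-2 softmax: %.4f\n", p_best(2));             // ~0.73, slightly inflated
        return 0;
    }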
int main(int argc, char ** argv) {
gpt_params params;
+ // needed to get candidate probs even for temp <= 0.0
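+ // (the speculative loop below inspects candidate probabilities, e.g. against p_split when splitting draft branches)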
+ params.sparams.n_probs = 128;
+
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1;
}
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split;
- std::default_random_engine rng(params.sparams.seed);
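+ // LLAMA_DEFAULT_SEED means no seed was requested, so seed the RNG from std::random_device instead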
+ std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
std::uniform_real_distribution<> u_dist;
// init llama.cpp
#include "ggml.h"
#include "llama.h"
-#include "llama-sampling.h"
#ifdef NDEBUG
#undef NDEBUG
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
}
+static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
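+ // warm up: run one apply/reset cycle outside the timed region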
+ std::vector<llama_token_data> cur(data.size());
+ std::copy(data.begin(), data.end(), cur.begin());
+ llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+ llama_sampler_apply(cnstr, &cur_p);
+ llama_sampler_reset(cnstr);
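+ // time n_iter apply/reset cycles, restoring the candidates from the untouched input each iteration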
+ const int64_t t_start = ggml_time_us();
+ for (int i = 0; i < n_iter; i++) {
+ std::copy(data.begin(), data.end(), cur.begin());
+ llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+ llama_sampler_apply(cnstr, &cur_p);
+ llama_sampler_reset(cnstr);
+ }
+ const int64_t t_end = ggml_time_us();
+ llama_sampler_free(cnstr);
+ printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+}
+
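+// BENCH stringifies the sampler constructor call (#__cnstr) so it can be printed as the benchmark label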
+#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
+
+static void test_perf() {
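+ // benchmark each sampler over a synthetic vocabulary of 2^17 = 131072 tokens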
+ const int n_vocab = 1 << 17;
+
+ std::vector<llama_token_data> data;
+
+ data.reserve(n_vocab);
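+ // fill with uniformly distributed random logits in [-1.0f, 1.0f]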
+ for (int i = 0; i < n_vocab; i++) {
+ const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f);
+ data.emplace_back(llama_token_data{i, logit, 0.0f});
+ }
+
+ BENCH(llama_sampler_init_top_k (40), data, 32);
+ BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
+ BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
+ BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
+ BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
+ BENCH(llama_sampler_init_softmax (), data, 32);
+}
+
int main(void) {
ggml_time_init();
printf("OK\n");
+ test_perf();
+
return 0;
}