From: Kevin Wang
Date: Mon, 8 Jul 2024 07:26:53 +0000 (-0400)
Subject: common : preallocate sampling token data vector (#8363)
X-Git-Tag: upstream/0.0.4488~1145
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=470939d483d1c89b7292f78bac1fd27c42c171ce;p=pkg%2Fggml%2Fsources%2Fllama.cpp

common : preallocate sampling token data vector (#8363)

Calling `emplace_back` repeatedly is slower than preallocating the vector
to the vocab size and writing the data directly. Some rudimentary profiling
with `chrono` shows that this change improves the performance of this block
of code from ~500us/op to ~40us/op.

Overall, this slightly improves sampling performance, with a more
substantial impact on the `examples/lookahead` implementation -- I see a
~10% performance boost in lookahead inference.
---

diff --git a/common/sampling.cpp b/common/sampling.cpp
index e9db2715..6a483c81 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (ctx_sampling->grammar != NULL && !apply_grammar) {
         GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
     }
 
     // apply params.logit_bias map
@@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
 
-    cur.clear();
+    cur.resize(n_vocab);
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
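
Editor's illustration (not part of the commit): the message mentions rudimentary `chrono` profiling. Below is a minimal, standalone sketch of that kind of micro-benchmark, contrasting the old pattern (`clear()` plus `emplace_back` per token) with the new one (`resize()` plus indexed assignment). `token_data` is a local stand-in for llama.cpp's `llama_token_data`, and `n_vocab = 32000` is an assumed vocab size; this is not the author's actual measurement harness.

```cpp
#include <chrono>
#include <cstdio>
#include <vector>

// Local stand-in for llama.cpp's llama_token_data {id, logit, p}.
struct token_data {
    int   id;
    float logit;
    float p;
};

int main() {
    const int n_vocab = 32000;            // assumed, roughly LLaMA-sized vocab
    const int n_iter  = 1000;
    std::vector<float>      logits(n_vocab, 0.5f); // dummy logits
    std::vector<token_data> cur;          // reused across calls, as in sampling.cpp
    float sink = 0.0f;                    // keeps the compiler from eliding the loops

    using clock = std::chrono::steady_clock;

    // Old pattern: clear, then grow one element at a time.
    auto t0 = clock::now();
    for (int it = 0; it < n_iter; it++) {
        cur.clear();
        for (int token_id = 0; token_id < n_vocab; token_id++) {
            cur.emplace_back(token_data{token_id, logits[token_id], 0.0f});
        }
        sink += cur[n_vocab / 2].logit;
    }
    auto t1 = clock::now();

    // New pattern: size the vector once, then assign by index.
    auto t2 = clock::now();
    for (int it = 0; it < n_iter; it++) {
        cur.resize(n_vocab);
        for (int token_id = 0; token_id < n_vocab; token_id++) {
            cur[token_id] = token_data{token_id, logits[token_id], 0.0f};
        }
        sink += cur[n_vocab / 2].logit;
    }
    auto t3 = clock::now();

    auto us_per_op = [&](clock::time_point a, clock::time_point b) {
        return std::chrono::duration_cast<std::chrono::microseconds>(b - a).count() / n_iter;
    };
    printf("emplace_back: ~%lld us/op\n", (long long) us_per_op(t0, t1));
    printf("resize+index: ~%lld us/op\n", (long long) us_per_op(t2, t3));
    printf("(sink = %f)\n", sink);
    return 0;
}
```

Note that `clear()` retains capacity, so the reused vector does not reallocate in either loop after the first iteration; the gap typically comes from the per-call size/capacity bookkeeping in `emplace_back`, which also tends to prevent the fill loop from being vectorized the way the plain indexed-store loop can be.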