From: Daniel Bevenius
Date: Thu, 9 May 2024 11:03:29 +0000 (+0200)
Subject: llama : update llama_timings.n_p_eval setting (#7160)
X-Git-Tag: upstream/0.0.4488~1659
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=fd9f92b154850014146f61717cd292a59a5cee5a;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : update llama_timings.n_p_eval setting (#7160)

This commit changes the value assigned to llama_timings.n_p_eval when
ctx->n_p_eval is 0 to be 0 instead of 1, which is the current value.

The motivation for this change is that if session caching is enabled,
for example using the `--prompt-cache main-session.txt` command line
argument for the main example, and the same prompt is used on
subsequent runs, then the prompt tokens will not actually be passed to
llama_decode, and n_p_eval will not be updated by llama_synchronize.
But the value of n_p_eval will be set to 1 by llama_get_timings because
ctx->n_p_eval is 0. This could be interpreted as one token having been
evaluated for the prompt, which could be misleading for applications
using this value.

Signed-off-by: Daniel Bevenius
---

diff --git a/llama.cpp b/llama.cpp
index 9c72d118..806c2093 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -17879,7 +17879,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
         /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

         /*.n_sample    =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval    =*/ std::max(1, ctx->n_p_eval),
+        /*.n_p_eval    =*/ std::max(0, ctx->n_p_eval),
         /*.n_eval      =*/ std::max(1, ctx->n_eval),
     };
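
Not part of the patch: a minimal sketch of how a caller might read these
counters after this change. report_prompt_timings is a hypothetical helper,
and t_p_eval_ms is assumed to be the prompt-evaluation timing field in the
same llama_timings struct.

    // Hypothetical helper, not part of this commit: reports prompt-eval
    // stats from llama_get_timings(). Assumes the llama_timings fields
    // shown in the diff above, plus t_p_eval_ms for the prompt eval time.
    #include <cstdio>

    #include "llama.h"

    static void report_prompt_timings(struct llama_context * ctx) {
        const struct llama_timings timings = llama_get_timings(ctx);

        if (timings.n_p_eval == 0) {
            // With this change, a prompt restored entirely from the session
            // cache (no tokens passed to llama_decode) shows up as 0 prompt
            // tokens instead of a misleading 1.
            printf("prompt: served from session cache, 0 tokens evaluated\n");
            return;
        }

        printf("prompt: %d tokens evaluated (%.2f tokens per second)\n",
               timings.n_p_eval,
               1e3 * timings.n_p_eval / timings.t_p_eval_ms);
    }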