From: Georgi Gerganov
Date: Fri, 9 Jan 2026 10:59:50 +0000 (+0200)
Subject: server : fix timing of prompt/generation (#18713)
X-Git-Tag: upstream/0.0.7721~37
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=53eb9435da3affa12a38a8b0fb29081698a8d1cc;p=pkg%2Fggml%2Fsources%2Fllama.cpp

server : fix timing of prompt/generation (#18713)
---

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index e1f65dfc..324c3af3 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2615,10 +2615,6 @@ private:
         // on successful decode, restore the original batch size
         n_batch = llama_n_batch(ctx);
 
-        // technically, measuring the time here excludes the sampling time for the last batch
-        // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
-        const int64_t t_current = ggml_time_us();
-
         for (auto & slot : slots) {
             // may need to copy state to other slots
             if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
@@ -2685,6 +2681,9 @@ private:
 
                 common_sampler_accept(slot.smpl.get(), id, true);
 
+                // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
+                const int64_t t_current = ggml_time_us();
+
                 slot.n_decoded += 1;
 
                 if (slot.n_decoded == 1) {
@@ -2728,6 +2727,8 @@ private:
                 slot.i_batch_dft.clear();
                 slot.drafted.clear();
 
+                const int64_t t_current = ggml_time_us();
+
                 slot.n_decoded += ids.size();
                 slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3;
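
Note: the sketch below is not part of the patch. It is a minimal standalone
illustration of the timing pattern the change adopts: take the timestamp only
after the step that synchronizes with the backend (sampling), rather than once
per batch before the slot loop. The helper time_us() is a stand-in for
ggml_time_us(); the variable names mirror those in the diff, everything else
is illustrative.

    // sketch.cpp - measure generation time after the synchronizing step
    #include <algorithm>
    #include <chrono>
    #include <cstdint>
    #include <cstdio>
    #include <thread>

    // stand-in for ggml_time_us(): monotonic time in microseconds
    static int64_t time_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    int main() {
        const int64_t t_start_generation = time_us();

        int    n_decoded          = 0;
        double t_token_generation = 0.0; // milliseconds

        for (int i = 0; i < 4; ++i) {
            // stand-in for decode + sampling; sampling forces a sync with the
            // backend, so the timestamp is only meaningful after it
            std::this_thread::sleep_for(std::chrono::milliseconds(10));

            const int64_t t_current = time_us();

            n_decoded += 1;
            t_token_generation = std::max<int64_t>(1, t_current - t_start_generation) / 1e3;

            printf("n_decoded = %d, t_token_generation = %.2f ms\n", n_decoded, t_token_generation);
        }

        return 0;
    }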