git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
server : fix timing of prompt/generation (#18713)
author     Georgi Gerganov <redacted>
           Fri, 9 Jan 2026 10:59:50 +0000 (12:59 +0200)
committer  GitHub <redacted>
           Fri, 9 Jan 2026 10:59:50 +0000 (12:59 +0200)
tools/server/server-context.cpp

index e1f65dfcceccac91f9caf083381f698c1c4e630d..324c3af30c14c5b34d1aa8fbad40ac8a57946d0b 100644 (file)
@@ -2615,10 +2615,6 @@ private:
             // on successful decode, restore the original batch size
             n_batch = llama_n_batch(ctx);
 
-            // technically, measuring the time here excludes the sampling time for the last batch
-            // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
-            const int64_t t_current = ggml_time_us();
-
             for (auto & slot : slots) {
                 // may need to copy state to other slots
                 if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
@@ -2685,6 +2681,9 @@ private:
 
                 common_sampler_accept(slot.smpl.get(), id, true);
 
+                // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
+                const int64_t t_current = ggml_time_us();
+
                 slot.n_decoded += 1;
 
                 if (slot.n_decoded == 1) {
@@ -2728,6 +2727,8 @@ private:
                 slot.i_batch_dft.clear();
                 slot.drafted.clear();
 
+                const int64_t t_current = ggml_time_us();
+
                 slot.n_decoded += ids.size();
 
                 slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
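Note (not part of the patch): the pattern the diff converges on is to take the ggml_time_us() timestamp only after sampling, because sampling reads the logits out of the llama_context and thereby synchronizes any asynchronous backend work, whereas a timestamp taken right after llama_decode() may not cover the finished batch. The sketch below is a minimal illustration of that pattern using the public llama.cpp and common sampling APIs; the helper decode_and_sample_timed and the step_result struct are hypothetical stand-ins for the per-slot bookkeeping in server-context.cpp, not code from the commit.

// sketch only: assumes a loaded llama_context and a configured common_sampler
#include "llama.h"
#include "sampling.h"   // common_sampler_* helpers from llama.cpp/common
#include "ggml.h"       // ggml_time_us

#include <cstdint>

struct step_result {
    llama_token token;      // sampled token id
    double      t_step_ms;  // wall time of decode + sample, measured after sync
};

static step_result decode_and_sample_timed(
        llama_context  * ctx,
        common_sampler * smpl,
        llama_batch      batch,
        int32_t          i_logits) {
    const int64_t t_start = ggml_time_us();

    // the decode may run asynchronously on some backends
    llama_decode(ctx, batch);

    // sampling pulls the logits out of the context, which forces synchronization,
    // so the timestamp taken afterwards covers the full decode + sampling work
    const llama_token id = common_sampler_sample(smpl, ctx, i_logits);
    common_sampler_accept(smpl, id, /*accept_grammar=*/true);

    const int64_t t_current = ggml_time_us();

    return { id, (t_current - t_start) / 1e3 };
}

In the server itself the same idea is applied per slot: t_current is captured once per sampled batch, after common_sampler_accept(), and slot.t_token_generation is derived from it in milliseconds.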