#include "common.h"
#include "log.h"
+#include <algorithm>
#include <cmath>
+#include <cstring>
#include <unordered_map>
-#include <algorithm>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
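// for example, with capacity 3: push_back(1), push_back(2), push_back(3), push_back(4)
// leaves {2, 3, 4} - once full, the oldest element is overwritten rather than growing the storage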
llama_token_data_array cur_p;
+ void reset() {
+ prev.clear();
+
+ llama_sampler_reset(grmr);
+ llama_sampler_reset(chain);
+ }
+
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
cur_p = { cur.data(), cur.size(), -1, false };
}
+
+ common_time_meas tm() {
+ return common_time_meas(t_total_us, params.no_perf);
+ }
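+
+ // assumption: common_time_meas is a small RAII timer along these lines - it records
+ // ggml_time_us() at construction and, unless disabled, adds the elapsed time to the
+ // referenced accumulator at destruction:
+ //
+ //   struct common_time_meas {
+ //       common_time_meas(int64_t & t_acc, bool disable)
+ //           : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+ //       ~common_time_meas() { if (t_start_us >= 0) { t_acc += ggml_time_us() - t_start_us; } }
+ //       const int64_t t_start_us;
+ //       int64_t & t_acc;
+ //   };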
+
+ mutable int64_t t_total_us = 0;
};
std::string common_params_sampling::print() const {
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+ const auto tm = gsmpl->tm();
+
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
}
void common_sampler_reset(struct common_sampler * gsmpl) {
- llama_sampler_reset(gsmpl->grmr);
-
- llama_sampler_reset(gsmpl->chain);
+ gsmpl->reset();
}
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance
+ const double t_sampling_ms = gsmpl ? 1e-3 * gsmpl->t_total_us : 0.0;
+
+ llama_perf_sampler_data data_smpl;
+ llama_perf_context_data data_ctx;
+
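+ // zero-initialize so the perf data is well-defined even when gsmpl or ctx is null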
+ memset(&data_smpl, 0, sizeof(data_smpl));
+ memset(&data_ctx, 0, sizeof(data_ctx));
+
if (gsmpl) {
- llama_perf_sampler_print(gsmpl->chain);
+ auto & data = data_smpl;
+
+ data = llama_perf_sampler(gsmpl->chain);
+
+ // note: the sampling time includes the samplers time + extra time spent in common/sampling
+ LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+ LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
}
+
if (ctx) {
- llama_perf_context_print(ctx);
+ auto & data = data_ctx;
+
+ data = llama_perf_context(ctx);
+
+ const double t_end_ms = 1e-3 * ggml_time_us();
+
+ const double t_total_ms = t_end_ms - data.t_start_ms;
+ const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+ const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
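+
+ // hypothetical numbers to illustrate: a total of 1000.00 ms with sampling = 50.00 ms,
+ // prompt eval = 300.00 ms and eval = 600.00 ms gives
+ // t_unacc_ms = 1000 - (50 + 300 + 600) = 50 ms, i.e. t_unacc_pc = 5.0 %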
+
+ LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+ LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+ LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+ LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+ LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+ LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
+
llama_memory_breakdown_print(ctx);
}
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+ llama_synchronize(ctx);
+
+ // start measuring the sampling time after the llama_context synchronization to avoid measuring any ongoing async operations
+ const auto tm = gsmpl->tm();
+
gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr;
// helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+ const auto tm = gsmpl->tm();
+
auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {
#include "llama.h"
#include "ggml.h"
+#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
-#include <numeric>
/**
* This is the arbitrary data which will be passed to each callback.
return u.f;
}
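// reads a single element of a tensor's raw data as a float; a usage sketch with
// hypothetical indices i0..i3, assuming a host-accessible tensor t whose byte strides
// come straight from t->nb:
//
//   const float v = ggml_get_float_value((const uint8_t *) t->data, t->type, t->nb, i0, i1, i2, i3);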
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
- v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+ v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
- v = *(float *) &data[i];
+ v = *(const float *) &data[i];
} else if (type == GGML_TYPE_I64) {
- v = (float) *(int64_t *) &data[i];
+ v = (float) *(const int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
- v = (float) *(int32_t *) &data[i];
+ v = (float) *(const int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
- v = (float) *(int16_t *) &data[i];
+ v = (float) *(const int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
- v = (float) *(int8_t *) &data[i];
+ v = (float) *(const int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
- v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+ v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}
for (auto * smpl : chain->samplers) {
llama_sampler_reset(smpl);
}
-
- chain->t_sample_us = 0;
- chain->n_sample = 0;
}
static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
void llama_perf_sampler_print(const struct llama_sampler * chain) {
const auto data = llama_perf_sampler(chain);
- LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+ LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
}
void llama_perf_sampler_reset(struct llama_sampler * chain) {
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
- ctx->t_sample_us = ctx->n_sample = 0;
+ ctx->t_sample_us = 0;
+ ctx->n_sample = 0;
}