server : export max observed n_past value (#15361)
author     Oleksandr Kuvshynov <redacted>
           Sun, 17 Aug 2025 22:28:58 +0000 (18:28 -0400)
committer  GitHub <redacted>
           Sun, 17 Aug 2025 22:28:58 +0000 (00:28 +0200)
Add tracking for the high-watermark cache usage and make it available via the /metrics endpoint.

Use-case: track the largest cache usage needed under a realistic workload,
to better understand memory requirements and to be able to adjust the
cache size/quantization for the model/cache accordingly.
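
As a rough illustration (not part of this commit), the new value can be read back over HTTP once the server is started with --metrics enabled. The sketch below is an assumption-laden example: it assumes a server listening on localhost:8080 and uses the cpp-httplib client header that llama.cpp already vendors as httplib.h; adjust host/port to your setup.

    // Hypothetical reader for the new metric; host/port are assumptions.
    #include "httplib.h"   // cpp-httplib, vendored by llama.cpp
    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
        httplib::Client cli("http://localhost:8080");    // assumed server address
        auto res = cli.Get("/metrics");                   // Prometheus-style text output
        if (!res || res->status != 200) {
            std::cerr << "failed to fetch /metrics\n";
            return 1;
        }
        // Print only the lines mentioning the new high-watermark metric.
        std::istringstream body(res->body);
        std::string line;
        while (std::getline(body, line)) {
            if (line.find("n_past_max") != std::string::npos) {
                std::cout << line << '\n';
            }
        }
        return 0;
    }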

tools/server/server.cpp

index 0b40f7bfa42581b707690f580ccb784dd91b387d..24bfae78919a44c0c94cd918a5b6b921772cce2b 100644
@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_tokens_predicted_total        = 0;
     uint64_t t_tokens_generation_total       = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
 
@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
             { "n_tokens_predicted_total",        n_tokens_predicted_total },
             { "t_prompt_processing_total",       t_prompt_processing_total },
 
+            { "n_past_max",                      n_past_max },
+
             { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
             { "t_prompt_processing",             t_prompt_processing },
             { "n_tokens_predicted",              n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
     uint64_t n_tokens_predicted_total        = 0;
     uint64_t t_tokens_generation_total       = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
 
@@ -1605,6 +1611,10 @@ struct server_metrics {
         n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
         t_prompt_processing             += slot.t_prompt_processing;
         t_prompt_processing_total       += slot.t_prompt_processing;
+
+        if (slot.n_past > 0) {
+            n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+        }
     }
 
     void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
             if (slot.is_processing()) {
                 n_busy_slots_total++;
             }
+            if (slot.n_past > 0) {
+                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+            }
         }
     }
 
@@ -2875,6 +2888,8 @@ struct server_context {
                     res->n_tokens_predicted_total        = metrics.n_tokens_predicted_total;
                     res->t_tokens_generation_total       = metrics.t_tokens_generation_total;
 
+                    res->n_past_max = metrics.n_past_max;
+
                     res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
                     res->t_prompt_processing       = metrics.t_prompt_processing;
                     res->n_tokens_predicted        = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
                     {"name",  "n_decode_total"},
                     {"help",  "Total number of llama_decode() calls"},
                     {"value",  res_metrics->n_decode_total}
+            }, {
+                    {"name",  "n_past_max"},
+                    {"help",  "Largest observed n_past."},
+                    {"value",  res_metrics->n_past_max}
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},