    uint64_t n_tokens_predicted_total  = 0;
    uint64_t t_tokens_generation_total = 0;
+   uint64_t n_past_max                = 0;
+
    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0;
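The first hunk adds the field to the metrics result struct; the result's `to_json()` then reports it between the same neighbors: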
{ "n_tokens_predicted_total", n_tokens_predicted_total },
{ "t_prompt_processing_total", t_prompt_processing_total },
+ { "n_past_max", n_past_max },
+
{ "n_prompt_tokens_processed", n_prompt_tokens_processed },
{ "t_prompt_processing", t_prompt_processing },
{ "n_tokens_predicted", n_tokens_predicted },
    uint64_t n_tokens_predicted_total  = 0;
    uint64_t t_tokens_generation_total = 0;
+   uint64_t n_past_max                = 0;
+
    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0;
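The aggregator refreshes the maximum in two places. The first is in `on_prompt_eval()`, right after the per-slot prompt counters are folded in: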
        n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
        t_prompt_processing       += slot.t_prompt_processing;
        t_prompt_processing_total += slot.t_prompt_processing;
+
+       if (slot.n_past > 0) {
+           n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+       }
    }

    void on_prediction(const server_slot & slot) {
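The `on_prediction()` signature above is only trailing context of that hunk; the second update goes into the per-slot loop of `on_decoded()`, which runs once per `llama_decode()` call and already tallies busy slots there: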
            if (slot.is_processing()) {
                n_busy_slots_total++;
            }
+           if (slot.n_past > 0) {
+               n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+           }
        }
    }
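In both call sites the `slot.n_past > 0` guard matters twice over: it skips idle slots, and it keeps the cast to `uint64_t` safe, since `n_past` is a signed integer on the slot (`int32_t` in server.cpp) and a negative value would sign-extend into an enormous unsigned number that pins the maximum forever. A minimal standalone sketch of the pitfall, not part of the patch:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    uint64_t n_past_max = 0;
    int32_t  n_past     = -1; // hypothetical bad value

    // Unguarded, (uint64_t) n_past would be 18446744073709551615 and
    // swamp every real observation. Guarded, as in the patch:
    if (n_past > 0) {
        n_past_max = std::max(n_past_max, (uint64_t) n_past);
    }
    printf("n_past_max = %llu\n", (unsigned long long) n_past_max); // prints 0
    return 0;
}
```

When a metrics task is processed, the aggregated value is copied into the result alongside the other fields: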
    res->n_tokens_predicted_total  = metrics.n_tokens_predicted_total;
    res->t_tokens_generation_total = metrics.t_tokens_generation_total;
+   res->n_past_max                = metrics.n_past_max;
+
    res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
    res->t_prompt_processing       = metrics.t_prompt_processing;
    res->n_tokens_predicted        = metrics.n_tokens_predicted;
{"name", "n_decode_total"},
{"help", "Total number of llama_decode() calls"},
{"value", res_metrics->n_decode_total}
+ }, {
+ {"name", "n_past_max"},
+ {"help", "Largest observed n_past."},
+ {"value", res_metrics->n_past_max}
}, {
{"name", "n_busy_slots_per_decode"},
{"help", "Average number of busy slots per llama_decode() call"},