"pp": {
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
- "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+ "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
},
"tg": {
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
- "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+ "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
},
}
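+ # NOTE: "p95" and "avg" come from the k6 client-side trends above, while the "0"
+ # entries are the mean of the samples scraped from the server /metrics endpoint,
+ # falling back to 0 when that metric was not reported.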
with open("results.github.env", 'a') as github_env:
k6_args = [
'run', args.scenario,
'--no-color',
+ '--no-connection-reuse',     # disable HTTP keep-alive so each request opens a fresh connection
+ '--no-vu-connection-reuse',  # do not reuse connections across VU iterations
]
k6_args.extend(['--duration', args.duration])
k6_args.extend(['--iterations', args.n_prompts])
k6_args.extend(['--vus', args.parallel])
k6_args.extend(['--summary-export', 'k6-results.json'])
+ k6_args.extend(['--out', 'csv=k6-results.csv'])
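+ # For illustration only (placeholder values), the assembled command resembles:
+ #   SERVER_BENCH_N_PROMPTS=<n> SERVER_BENCH_MAX_PROMPT_TOKENS=<n> SERVER_BENCH_MAX_CONTEXT=<n> \
+ #     k6 run script.js --no-color --no-connection-reuse --no-vu-connection-reuse \
+ #     --duration <d> --iterations <n> --vus <n> --summary-export k6-results.json --out csv=k6-results.csv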
args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
print(f"bench: starting k6 with: {args}")
server_process = start_server_background(args)
attempts = 0
- max_attempts = 20
+ max_attempts = 600
if 'GITHUB_ACTIONS' in os.environ:
max_attempts *= 2
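+ # 600 attempts at a 0.5 s poll interval gives the server roughly 5 minutes to come up
+ # (roughly 10 minutes on GitHub Actions), presumably to leave room for the model download and load.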
print(f"bench: waiting for server to start ...")
time.sleep(0.5)
- print("bench: server started.")
+ attempts = 0
+ while not is_server_ready(args.host, args.port):
+ attempts += 1
+ if attempts > max_attempts:
+ assert False, "server not ready"
+ print(f"bench: waiting for server to be ready ...")
+ time.sleep(0.5)
+
+ print("bench: server started and ready.")
return server_process
'--host', args.host,
'--port', args.port,
]
- model_file = args.model_path_prefix + os.path.sep + args.hf_file
- model_dir = os.path.dirname(model_file)
- if not os.path.exists(model_dir):
- os.makedirs(model_dir)
- server_args.extend(['--model', model_file])
server_args.extend(['--hf-repo', args.hf_repo])
server_args.extend(['--hf-file', args.hf_file])
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
return _is_server_listening
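+# NOTE: the llama.cpp server /health endpoint responds 503 while the model is still
+# loading and 200 once it is ready to accept requests, so polling it also waits for
+# the model load to complete (server behavior at the time of writing).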
+def is_server_ready(server_fqdn, server_port):
+ url = f"http://{server_fqdn}:{server_port}/health"
+ response = requests.get(url)
+ return response.status_code == 200
+
+
def escape_metric_name(metric_name):
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
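+// Time from sending the request to receiving the first streamed token, in seconds.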
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
],
"model": model,
"stream": true,
+ "stream_options": {
+ "include_usage": true, // False to be supported in llama.cpp server
+ },
"seed": 42,
"max_tokens": max_tokens,
"stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
client.on('event', function (event) {
if (promptEvalEndTime == null) {
promptEvalEndTime = new Date()
+ llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+ }
+
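+ // The stream ends with a non-JSON "[DONE]" sentinel and may contain empty events;
+ // skip both before attempting to parse the payload.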
+ if (event.data === '[DONE]' || event.data === '') {
+ return
}
let chunk = JSON.parse(event.data)
- let choice = chunk.choices[0]
- if (choice.finish_reason) {
- finish_reason = choice.finish_reason
+
+ if (chunk.choices && chunk.choices.length > 0) {
+ let choice = chunk.choices[0]
+ if (choice.finish_reason) {
+ finish_reason = choice.finish_reason
+ }
}
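+ // With include_usage enabled, the final chunk carries only usage statistics and an
+ // empty choices array, which is handled below.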
if (chunk.usage) {