git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
server: bench: minor fixes (#10765)
author Pierrick Hymbert <redacted>
Thu, 2 Jan 2025 17:06:12 +0000 (18:06 +0100)
committer GitHub <redacted>
Thu, 2 Jan 2025 17:06:12 +0000 (18:06 +0100)
* server/bench:
- support OpenAI streaming standard output terminated with [DONE]\n\n (illustrated in the sketch below)
- export k6 raw results as CSV
- fix too many idle TCP connections in tcp_wait
- add a metric for time to emit first token

* server/bench:
- fix handling when Prometheus is not started
- wait for the server to be ready before starting the bench
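
For illustration only, a minimal Python sketch (not part of this commit) of how a client consumes the OpenAI-compatible streaming output now terminated with `data: [DONE]`, and how a time-to-first-token measurement can be taken; the host, port and payload fields below are assumptions matching the benchmark setup, not code from this repository:

```python
# Minimal sketch (not from this commit): consume an OpenAI-compatible SSE
# stream that ends with "data: [DONE]" and record time to first token.
# Host, port and payload fields are assumptions matching the bench setup.
import json
import time

import requests


def stream_and_time(prompt, host="localhost", port=8080, max_tokens=64):
    url = f"http://{host}:{port}/v1/chat/completions"
    payload = {
        "model": "default",
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "max_tokens": max_tokens,
    }
    start = time.time()
    first_token_s = None
    with requests.post(url, json=payload, stream=True) as response:
        for line in response.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip SSE keep-alive blank lines
            data = line[len("data: "):]
            if data == "[DONE]":
                break  # end-of-stream marker of the OpenAI streaming format
            chunk = json.loads(data)
            if first_token_s is None:
                first_token_s = time.time() - start  # time to emit first token
            if chunk.get("choices"):
                print(chunk["choices"][0].get("delta", {}).get("content", ""),
                      end="", flush=True)
    return first_token_s
```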

examples/server/bench/README.md
examples/server/bench/bench.py
examples/server/bench/script.js

index 353368e13b0c8b742e08fd1f70f2006dde1e9fd1..9549795ec29f913010df126043f23944ce56eef5 100644 (file)
@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example:
+Example (assuming golang >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
 
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
   --model ggml-model-q4_0.gguf \
   --cont-batching \
   --metrics \
index a9ed747f51db55763d228f973ae252dd98e7325a..5cc6f92ab6c53ee6eae18fe3eee02281fd4095e9 100644 (file)
@@ -189,12 +189,12 @@ xychart-beta
         "pp": {
             "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
         },
         "tg": {
             "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
         },
     }
     with open("results.github.env", 'a') as github_env:
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
@@ -231,7 +234,7 @@ def start_server(args):
     server_process = start_server_background(args)
 
     attempts = 0
-    max_attempts = 20
+    max_attempts = 600
     if 'GITHUB_ACTIONS' in os.environ:
         max_attempts *= 2
 
@@ -242,7 +245,15 @@ def start_server(args):
         print(f"bench:     waiting for server to start ...")
         time.sleep(0.5)
 
-    print("bench: server started.")
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench:     waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
     return server_process
 
 
@@ -255,11 +266,6 @@ def start_server_background(args):
         '--host', args.host,
         '--port', args.port,
     ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir  = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
     server_args.extend(['--hf-repo', args.hf_repo])
     server_args.extend(['--hf-file', args.hf_file])
     server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
         return _is_server_listening
 
 
+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())
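
The readiness wait distinguishes a server that is merely listening from one whose `/health` endpoint answers 200. A standalone sketch of the same idea (not the helper added in this commit), tolerating the phase where the port does not accept connections yet:

```python
# Standalone sketch (not the helper added in this commit): poll /health until
# the server reports ready, treating connection errors as "not ready yet".
import time

import requests


def wait_until_ready(host="localhost", port=8080, timeout_s=300):
    url = f"http://{host}:{port}/health"
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server not listening or still loading the model
        time.sleep(0.5)
    return False
```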
 
index bdf4f5abc87f79f9d7a9f7b252884ceb1117df05..2772bee5e5f38c33d01c2d61dbd482ab47f0a1f6 100644 (file)
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+          "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
         client.on('event', function (event) {
             if (promptEvalEndTime == null) {
                 promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+            }
+
+            if (event.data === '[DONE]' || event.data === '') {
+                return
             }
 
             let chunk = JSON.parse(event.data)
-            let choice = chunk.choices[0]
-            if (choice.finish_reason) {
-                finish_reason = choice.finish_reason
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
             }
 
             if (chunk.usage) {
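
For reference, a Python analogue (an illustration, not the k6 script) of the chunk handling above: empty or `[DONE]` events are skipped, regular chunks may carry a `finish_reason`, and in the OpenAI streaming format a request with `stream_options.include_usage` ends with a chunk whose `choices` array is empty and which carries the `usage` object, hence the guard:

```python
# Python analogue (illustration only) of the updated script.js event handler.
import json


def handle_sse_data(data, totals):
    if data in ("", "[DONE]"):
        return  # keep-alive or end-of-stream marker, nothing to parse
    chunk = json.loads(data)
    if chunk.get("choices"):
        choice = chunk["choices"][0]
        if choice.get("finish_reason"):
            totals["finish_reason"] = choice["finish_reason"]
    if chunk.get("usage"):
        totals["prompt_tokens"] = chunk["usage"].get("prompt_tokens", 0)
        totals["completion_tokens"] = chunk["usage"].get("completion_tokens", 0)
```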