server: add test for token probs (#7347)

author Johannes Gäßler <redacted>

Sun, 19 May 2024 14:26:02 +0000 (16:26 +0200)

committer GitHub <redacted>

Sun, 19 May 2024 14:26:02 +0000 (16:26 +0200)
author Johannes Gäßler <redacted>
Sun, 19 May 2024 14:26:02 +0000 (16:26 +0200)
committer GitHub <redacted>
Sun, 19 May 2024 14:26:02 +0000 (16:26 +0200)
diff --git a/examples/server/README.md b/examples/server/README.md

index 4f3262cdd2d0e73b75ab5c9ba48edb2b3306cbb1..0c3db8c84c69d0ffbc766b76bee88d338c943ba5 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -48,7 +48,7 @@ The project is under active development, and we are [looking for feedback and co
  - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
  - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
  - `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
-- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
+- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests, but the results will **not** be deterministic due to differences in rounding error.
  - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching).  Default: disabled
  - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
  - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature

index aa0b8d0c648b4c8882349193ba60997d7672e8ff..5deb278c2a53ce47032e468f3f3ad7b1f90dadc7 100644 (file)
--- a/examples/server/tests/features/results.feature
+++ b/examples/server/tests/features/results.feature
@@ -70,12 +70,48 @@ Feature: Results
      Then all predictions are equal
      Examples:
        | n_parallel | temp |
-      |  1         | 0.0  |
-      |  2         | 0.0  |
-      |  4         | 0.0  |
-      |  1         | 1.0  |
-      # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
+      | 1          | 0.0  |
+      | 2          | 0.0  |
+      | 4          | 0.0  |
+      | 1          | 1.0  |
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
        # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
-      # |  2         | 1.0  |
-      # |  4         | 1.0  |
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+      # | 2          | 1.0  |
+      # | 4          | 1.0  |
+
+  Scenario Outline: consistent token probs with same seed and prompt
+    Given <n_slots> slots
+    And   <n_kv> KV cache size
+    And   1.0 temperature
+    And   <n_predict> max tokens to predict
+    Then  the server is starting
+    Then  the server is healthy
+
+    Given 1 prompts "The meaning of life is" with seed 42
+    And   concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then  the server is idle
+    And   all slots are idle
+
+    Given <n_parallel> prompts "The meaning of life is" with seed 42
+    And   concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And  all slots are idle
+
+    Then all token probabilities are equal
+    Examples:
+      | n_slots | n_kv | n_predict | n_parallel |
+      | 4       | 1024 | 1         | 1          |
+      | 4       | 1024 | 1         | 4          |
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
+      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+      # | 4       | 1024 | 100       | 1          |
+      # This test still fails even with the above patches; the first token probabilities are already different.
+      # | 4       | 1024 | 100       | 4          |
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py

index 577b87af341952a4c82dda05ba1d10e593c121a7..7da503f2c4b3452c63cca946f7f2846fdd72bb33 100644 (file)
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -23,6 +23,7 @@ from prometheus_client import parser
  def step_server_config(context, server_fqdn, server_port):
      context.server_fqdn = server_fqdn
      context.server_port = int(server_port)
+    context.n_threads = None
      context.n_gpu_layer = None
      if 'PORT' in os.environ:
          context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
      context.n_gpu_layer = ngl
  
  
+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):  # store the thread count read back when the server is started
+    context.n_threads = n_threads
+
+
  @step('{draft:d} as draft')
  def step_draft(context, draft):
      context.draft = draft
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):
  
  @step('all predictions are different')
  @async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
      n_completions = await gather_tasks_results(context)
      assert n_completions >= 2, "need at least 2 completions"
      assert_all_predictions_different(context.tasks_result)
      context.tasks_result = []
  
  
+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):  # step: every collected completion must report identical token probabilities
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"  # a comparison needs at least two results
+    assert_all_token_probabilities_equal(context.tasks_result)
+    context.tasks_result = []  # clear the collected results once they have been checked
+
+
  @step('the completion is  truncated')
  def step_assert_completion_truncated(context):
      step_assert_completion_truncated(context, '')
@@ -869,6 +884,7 @@ async def request_completion(prompt,
                                      "id_slot": id_slot,
                                      "seed": seed if seed is not None else 42,
                                      "temperature": temperature if temperature is not None else "0.8f",
+                                    "n_probs": 2,
                                  },
                                  headers=headers,
                                  timeout=3600) as response:
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
          assert content_i != content_j, "contents not different"
  
  
+def assert_all_token_probabilities_equal(completion_responses):
+    # Check that every response reports the same 'probs' entry as every other response at each position.
+    n_predict = len(completion_responses[0]['completion_probabilities'])
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        for pos in range(n_predict):
+            for i, response_i in enumerate(completion_responses):
+                probs_i = response_i['completion_probabilities'][pos]['probs']
+                print(f"pos {pos}, probs {i}: {probs_i}")
+    for pos in range(n_predict):
+        for i, response_i in enumerate(completion_responses):
+            probs_i = response_i['completion_probabilities'][pos]['probs']
+            for j, response_j in enumerate(completion_responses):
+                # Assert inside the pair loop: probs_j is always bound and each pair is checked explicitly.
+                probs_j = response_j['completion_probabilities'][pos]['probs']
+                assert probs_i == probs_j, "contents not equal"
+
+
  async def gather_tasks_results(context):
      n_tasks = len(context.concurrent_tasks)
      if context.debug:
@@ -1261,6 +1294,8 @@ def start_server_background(context):
          server_args.extend(['--batch-size', context.n_batch])
      if context.n_ubatch:
          server_args.extend(['--ubatch-size', context.n_ubatch])
+    if context.n_threads:  # forward --threads only when a thread count was configured
+        server_args.extend(['--threads', context.n_threads])
      if context.n_gpu_layer:
          server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
      if context.draft is not None:
author	Johannes Gäßler <redacted>
	Sun, 19 May 2024 14:26:02 +0000 (16:26 +0200)
committer	GitHub <redacted>
	Sun, 19 May 2024 14:26:02 +0000 (16:26 +0200)
examples/server/README.md		patch \| blob \| history
examples/server/tests/features/results.feature		patch \| blob \| history
examples/server/tests/features/steps/steps.py		patch \| blob \| history