        slot.n_prompt_tokens_processed = 0;
        slot.prompt.tokens.keep_first(n_past);
+
+       // if streaming with progress reports enabled, send an initial 0%
+       // update so the client knows the request has started processing
+       if (slot.task->params.stream && slot.task->params.return_progress) {
+           send_partial_response(slot, {}, true);
+       }
    }

    if (!slot.can_split()) {
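For illustration, a minimal sketch of the first streamed progress object a client might see after this change. The field names (`total`, `cache`, `processed`) come from the test below; the payload wrapper and the concrete values here are assumptions:

```python
# Hypothetical first progress report (illustrative values).
# On the initial 0% update, only tokens reused from the KV cache are
# counted as processed, so cache == processed.
first_chunk = {
    "prompt_progress": {
        "total": 512,     # prompt length in tokens (assumed value)
        "cache": 64,      # tokens reused from the KV cache (assumed value)
        "processed": 64,  # equals "cache" on the first report
    },
}
```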
@pytest.mark.parametrize(
    "n_batch,batch_count,reuse_cache",
    [
-       (64, 3, False),
-       (64, 1, True),
+       (64, 4, False),  # one extra progress event per request: the initial 0% report
+       (64, 2, True),
    ]
)
def test_return_progress(n_batch, batch_count, reuse_cache):
    res = make_cmpl_request()
    last_progress = None
    total_batch_count = 0
+
    for data in res:
        cur_progress = data.get("prompt_progress", None)
        if cur_progress is None:
            continue
+       if total_batch_count == 0:
+           # the first progress report counts cached tokens as already
+           # processed, so cache == processed
+           assert cur_progress["total"] > 0
+           assert cur_progress["cache"] == cur_progress["processed"]
+           if reuse_cache:
+               # when reusing the cache, we expect some cached tokens
+               assert cur_progress["cache"] > 0
        if last_progress is not None:
            # totals and cache counts are fixed for the request, while the
            # processed count must grow with each report
            assert cur_progress["total"] == last_progress["total"]
            assert cur_progress["cache"] == last_progress["cache"]
            assert cur_progress["processed"] > last_progress["processed"]
        total_batch_count += 1
        last_progress = cur_progress

+   # the last progress report should indicate completion (all tokens processed)
    assert last_progress is not None
    assert last_progress["total"] > 0
    assert last_progress["processed"] == last_progress["total"]
    # each parametrized case expects a fixed number of progress events
    assert total_batch_count == batch_count
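As a usage sketch, one way a client could consume these progress events over the streaming API. The host, endpoint path, and SSE framing are assumptions; the `stream` and `return_progress` flags and the `prompt_progress` fields match the code above:

```python
import json

import requests  # assumed HTTP client dependency


def stream_with_progress(prompt: str) -> None:
    # hypothetical local server and endpoint; adjust to the real setup
    resp = requests.post(
        "http://localhost:8080/completion",
        json={"prompt": prompt, "stream": True, "return_progress": True},
        stream=True,
    )
    for raw in resp.iter_lines():
        # SSE events arrive as lines of the form: data: {...}
        if not raw.startswith(b"data: "):
            continue
        payload = raw[len(b"data: "):]
        if payload == b"[DONE]":  # some endpoints terminate the stream this way
            break
        data = json.loads(payload)
        progress = data.get("prompt_progress")
        if progress is None:
            continue  # regular token chunks carry no progress field
        done, total = progress["processed"], progress["total"]
        print(f"prompt processing: {done}/{total} tokens ({100 * done // max(total, 1)}%)")
```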