{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion.chunk"},
+ });
+
+ // The OpenAI API spec for chat.completion.chunk objects specifies an empty `choices` array for the final chunk when usage is included
+ // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
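+ // Illustrative final chunk as it appears on the wire (all values hypothetical),
+ // immediately before the terminating `data: [DONE]` message:
+ //   data: {"choices":[],"created":1700000000,"id":"chatcmpl-...","model":"gpt-3.5-turbo",
+ //          "system_fingerprint":"b0000","object":"chat.completion.chunk",
+ //          "usage":{"completion_tokens":10,"prompt_tokens":8,"total_tokens":18}}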
+ deltas.push_back({
+ {"choices", json::array()},
+ {"created", t},
+ {"id", oaicompat_cmpl_id},
+ {"model", oaicompat_model},
+ {"system_fingerprint", build_info},
+ {"object", "chat.completion.chunk"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
content = ""
last_cmpl_id = None
for i, data in enumerate(res):
- choice = data["choices"][0]
- if i == 0:
- # Check first role message for stream=True
- assert choice["delta"]["content"] is None
- assert choice["delta"]["role"] == "assistant"
+ if data["choices"]:
+ choice = data["choices"][0]
+ if i == 0:
+ # Check first role message for stream=True
+ assert choice["delta"]["content"] is None
+ assert choice["delta"]["role"] == "assistant"
+ else:
+ assert "role" not in choice["delta"]
+ assert data["system_fingerprint"].startswith("b")
+ assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+ if last_cmpl_id is None:
+ last_cmpl_id = data["id"]
+ assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
+ if choice["finish_reason"] in ["stop", "length"]:
+ assert "content" not in choice["delta"]
+ assert match_regex(re_content, content)
+ assert choice["finish_reason"] == finish_reason
+ else:
+ assert choice["finish_reason"] is None
+ content += choice["delta"]["content"] or ''
else:
- assert "role" not in choice["delta"]
- assert data["system_fingerprint"].startswith("b")
- assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
- if last_cmpl_id is None:
- last_cmpl_id = data["id"]
- assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
- if choice["finish_reason"] in ["stop", "length"]:
assert data["usage"]["prompt_tokens"] == n_prompt
assert data["usage"]["completion_tokens"] == n_predicted
- assert "content" not in choice["delta"]
- assert match_regex(re_content, content)
- assert choice["finish_reason"] == finish_reason
- else:
- assert choice["finish_reason"] is None
- content += choice["delta"]["content"] or ''
def test_chat_completion_with_openai_library():
assert data["choices"][0]["delta"]["role"] == "assistant"
assert "timings" not in data, f'First event should not have timings: {data}'
else:
- assert "role" not in data["choices"][0]["delta"]
- assert "timings" in data
- assert "prompt_per_second" in data["timings"]
- assert "predicted_per_second" in data["timings"]
- assert "predicted_n" in data["timings"]
- assert data["timings"]["predicted_n"] <= 10
+ if data["choices"]:
+ assert "role" not in data["choices"][0]["delta"]
+ else:
+ assert "timings" in data
+ assert "prompt_per_second" in data["timings"]
+ assert "predicted_per_second" in data["timings"]
+ assert "predicted_n" in data["timings"]
+ assert data["timings"]["predicted_n"] <= 10
def test_logprobs():
output_text = ''
aggregated_text = ''
for i, data in enumerate(res):
- choice = data.choices[0]
- if i == 0:
- # Check first role message for stream=True
- assert choice.delta.content is None
- assert choice.delta.role == "assistant"
- else:
- assert choice.delta.role is None
- if choice.finish_reason is None:
- if choice.delta.content:
- output_text += choice.delta.content
- assert choice.logprobs is not None
- assert choice.logprobs.content is not None
- for token in choice.logprobs.content:
- aggregated_text += token.token
- assert token.logprob <= 0.0
- assert token.bytes is not None
- assert token.top_logprobs is not None
- assert len(token.top_logprobs) > 0
+ if data.choices:
+ choice = data.choices[0]
+ if i == 0:
+ # Check first role message for stream=True
+ assert choice.delta.content is None
+ assert choice.delta.role == "assistant"
+ else:
+ assert choice.delta.role is None
+ if choice.finish_reason is None:
+ if choice.delta.content:
+ output_text += choice.delta.content
+ assert choice.logprobs is not None
+ assert choice.logprobs.content is not None
+ for token in choice.logprobs.content:
+ aggregated_text += token.token
+ assert token.logprob <= 0.0
+ assert token.bytes is not None
+ assert token.top_logprobs is not None
+ assert len(token.top_logprobs) > 0
assert aggregated_text == output_text
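+ # Illustrative helper (a sketch, not asserted above): the returned logprobs are
+ # natural-log probabilities, so per-token probabilities can be recovered with exp().
+ # `choice` is assumed to be a streamed choice object from the openai client library.
+ def _token_probs(choice):
+     import math  # local import keeps the sketch self-contained
+     return [math.exp(tok.logprob) for tok in choice.logprobs.content]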
arguments_parts = 0
for chunk in self.make_stream_request(method, path, data, headers):
- assert len(chunk['choices']) == 1, f'Expected 1 choice, got {len(chunk["choices"])}'
- choice = chunk['choices'][0]
- if choice['delta'].get('content') is not None:
- assert len(choice['delta']['content']) > 0, f'Expected non empty content delta!'
- content.append(choice['delta']['content'])
- content_parts += 1
- if choice['delta'].get('reasoning_content') is not None:
- assert len(choice['delta']['reasoning_content']) > 0, f'Expected non empty reasoning_content delta!'
- reasoning_content.append(choice['delta']['reasoning_content'])
- reasoning_content_parts += 1
- if choice['delta'].get('finish_reason') is not None:
- finish_reason = choice['delta']['finish_reason']
- for tc in choice['delta'].get('tool_calls', []):
- if 'function' not in tc:
- raise ValueError(f"Expected function type, got {tc['type']}")
- if tc['index'] >= len(tool_calls):
- assert 'id' in tc
- assert tc.get('type') == 'function'
- assert 'function' in tc and 'name' in tc['function'] and len(tc['function']['name']) > 0, \
- f"Expected function call with name, got {tc.get('function')}"
- tool_calls.append(dict(
- id="",
- type="function",
- function=dict(
- name="",
- arguments="",
- )
- ))
- tool_call = tool_calls[tc['index']]
- if tc.get('id') is not None:
- tool_call['id'] = tc['id']
- fct = tc['function']
- assert 'id' not in fct, f"Function call should not have id: {fct}"
- if fct.get('name') is not None:
- tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
- if fct.get('arguments') is not None:
- tool_call['function']['arguments'] += fct['arguments']
- arguments_parts += 1
- tool_call_parts += 1
-
+ if chunk['choices']:
+ assert len(chunk['choices']) == 1, f'Expected 1 choice, got {len(chunk["choices"])}'
+ choice = chunk['choices'][0]
+ if choice['delta'].get('content') is not None:
+ assert len(choice['delta']['content']) > 0, 'Expected non-empty content delta!'
+ content.append(choice['delta']['content'])
+ content_parts += 1
+ if choice['delta'].get('reasoning_content') is not None:
+ assert len(choice['delta']['reasoning_content']) > 0, 'Expected non-empty reasoning_content delta!'
+ reasoning_content.append(choice['delta']['reasoning_content'])
+ reasoning_content_parts += 1
+ if choice['delta'].get('finish_reason') is not None:
+ finish_reason = choice['delta']['finish_reason']
+ for tc in choice['delta'].get('tool_calls', []):
+ if 'function' not in tc:
+ raise ValueError(f"Expected function type, got {tc['type']}")
+ if tc['index'] >= len(tool_calls):
+ assert 'id' in tc
+ assert tc.get('type') == 'function'
+ assert 'function' in tc and 'name' in tc['function'] and len(tc['function']['name']) > 0, \
+ f"Expected function call with name, got {tc.get('function')}"
+ tool_calls.append(dict(
+ id="",
+ type="function",
+ function=dict(
+ name="",
+ arguments="",
+ )
+ ))
+ tool_call = tool_calls[tc['index']]
+ if tc.get('id') is not None:
+ tool_call['id'] = tc['id']
+ fct = tc['function']
+ assert 'id' not in fct, f"Function call should not have id: {fct}"
+ if fct.get('name') is not None:
+ tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
+ if fct.get('arguments') is not None:
+ tool_call['function']['arguments'] += fct['arguments']
+ arguments_parts += 1
+ tool_call_parts += 1
+ else:
+ # When `include_usage` is True (the default), we expect the last chunk of the stream
+ # immediately preceding the `data: [DONE]` message to contain a `choices` field with an empty array
+ # and a `usage` field containing the usage statistics (n.b., llama-server also returns `timings` in
+ # the last chunk)
+ assert 'usage' in chunk, f"Expected usage in chunk: {chunk}"
+ assert 'timings' in chunk, f"Expected timings in chunk: {chunk}"
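+ # Sketch (illustrative, not asserted here): once the stream ends, each aggregated
+ # tool call's `arguments` string should be complete JSON and can be decoded, e.g.:
+ #   import json
+ #   parsed = [json.loads(tc['function']['arguments']) for tc in tool_calls]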
print(f'Streamed response had {content_parts} content parts, {reasoning_content_parts} reasoning_content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
result = dict(
choices=[