huggingface_hub>=0.34.0,<1.0
matplotlib~=3.10.0
numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
pandas~=2.2.3
prometheus-client~=0.20.0
requests~=2.32.3
**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
- * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+ * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) compatible chat completions
* Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510)
* Parallel decoding with multi-user support
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
+### POST `/v1/responses`: OpenAI-compatible Responses API
+
+*Options:*
+
+See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).
+
+*Examples:*
+
+You can use either the Python `openai` library, pointed at the llama.cpp server:
+
+```python
+import openai
+
+client = openai.OpenAI(
+ base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+ api_key = "sk-no-key-required"
+)
+
+response = client.responses.create(
+ model="gpt-4.1",
+ instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+ input="Write a limerick about python exceptions"
+)
+
+print(response.output_text)
+```
+
+... or raw HTTP requests:
+
+```shell
+curl http://localhost:8080/v1/responses \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer no-key" \
+-d '{
+"model": "gpt-4.1",
+"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+"input": "Write a limerick about python exceptions"
+}'
+```
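+
+Streaming is also supported. Here is a minimal sketch using the same client as above (text arrives as `response.output_text.delta` events, mirroring the OpenAI Responses streaming format):
+
+```python
+# Minimal streaming sketch; assumes the `client` object from the example above.
+stream = client.responses.create(
+    model="gpt-4.1",
+    input="Write a limerick about python exceptions",
+    stream=True,
+)
+
+for event in stream:
+    # Each event carries a `type`; print the generated text as it arrives.
+    if event.type == "response.output_text.delta":
+        print(event.delta, end="", flush=True)
+```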
+
+This endpoint works by converting the Responses API request into a Chat Completions request. Note that `previous_response_id` is not supported.
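+
+As a rough illustration (not actual server code), the request above maps onto a Chat Completions payload as follows: `instructions` becomes a system message and a string `input` becomes a user message.
+
+```python
+# Illustrative sketch of the mapping the server performs internally;
+# the field names follow the two APIs, the dicts themselves are hypothetical.
+responses_body = {
+    "model": "gpt-4.1",
+    "instructions": "You are ChatGPT, an AI assistant.",
+    "input": "Write a limerick about python exceptions",
+}
+
+chatcmpl_body = {
+    "model": responses_body["model"],
+    "messages": [
+        {"role": "system", "content": responses_body["instructions"]},
+        {"role": "user", "content": responses_body["input"]},
+    ],
+}
+```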
+
+
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling type other than `none`. The embeddings are normalized using the Euclidean norm.
return llama_params;
}
+json convert_responses_to_chatcmpl(const json & response_body) {
+ if (!response_body.contains("input")) {
+ throw std::invalid_argument("'input' is required");
+ }
+ if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
+ throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+ }
+
+ const json input_value = response_body.at("input");
+ json chatcmpl_body = response_body;
+ chatcmpl_body.erase("input");
+ std::vector<json> chatcmpl_messages;
+
+ if (response_body.contains("instructions")) {
+ chatcmpl_messages.push_back({
+ {"role", "system"},
+ {"content", json_value(response_body, "instructions", std::string())},
+ });
+ chatcmpl_body.erase("instructions");
+ }
+
+ if (input_value.is_string()) {
+ // #responses_create-input-text_input
+ chatcmpl_messages.push_back({
+ {"role", "user"},
+ {"content", input_value},
+ });
+ } else if (input_value.is_array()) {
+ // #responses_create-input-input_item_list
+
+ static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
+ return j.contains(key) && j.at(key).is_array();
+ };
+ static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
+ return j.contains(key) && j.at(key).is_string();
+ };
+
+ for (json item : input_value) {
+ if (exists_and_is_string(item, "content")) {
+ // #responses_create-input-input_item_list-input_message-content-text_input
+ // Only "Input message" contains item["content"]::string
+ // After converting item["content"]::string to item["content"]::array,
+ // we can treat "Input message" as the sum of "Item-Input message" and "Item-Output message"
+ item["content"] = json::array({
+ json {
+ {"text", item.at("content")},
+ {"type", "input_text"}
+ }
+ });
+ }
+
+ if (exists_and_is_array(item, "content") &&
+ exists_and_is_string(item, "role") &&
+ (item.at("role") == "user" ||
+ item.at("role") == "system" ||
+ item.at("role") == "developer")
+ ) {
+ // #responses_create-input-input_item_list-item-input_message
+ std::vector<json> chatcmpl_content;
+
+ for (const json & input_item : item.at("content")) {
+ const std::string type = json_value(input_item, "type", std::string());
+
+ if (type == "input_text") {
+ if (!input_item.contains("text")) {
+ throw std::invalid_argument("'Input text' requires 'text'");
+ }
+ chatcmpl_content.push_back({
+ {"text", input_item.at("text")},
+ {"type", "text"},
+ });
+ } else if (type == "input_image") {
+ // While `detail` is marked as required,
+ // it has a default value ("auto") and can be omitted.
+
+ if (!input_item.contains("image_url")) {
+ throw std::invalid_argument("'image_url' is required");
+ }
+ chatcmpl_content.push_back({
+ {"image_url", json {
+ {"url", input_item.at("image_url")}
+ }},
+ {"type", "image_url"},
+ });
+ } else if (type == "input_file") {
+ throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
+ // if (input_item.contains("file_url")) {
+ // // chat completion API does not support file_url
+ // throw std::invalid_argument("'file_url' is not supported");
+ // }
+ // if (!input_item.contains("file_data") || !input_item.contains("filename")) {
+ // throw std::invalid_argument("Both 'file_data' and 'filename' are required");
+ // }
+ // chatcmpl_content.push_back({
+ // {"file", json {
+ // {"file_data", input_item.at("file_data")},
+ // {"filename", input_item.at("filename")},
+ // }},
+ // {"type", "file"},
+ // });
+ } else {
+ throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
+ }
+ }
+
+ if (item.contains("type")) {
+ item.erase("type");
+ }
+ if (item.contains("status")) {
+ item.erase("status");
+ }
+ item["content"] = chatcmpl_content;
+
+ chatcmpl_messages.push_back(item);
+ } else if (exists_and_is_array(item, "content") &&
+ exists_and_is_string(item, "role") &&
+ item.at("role") == "assistant" &&
+ // exists_and_is_string(item, "status") &&
+ // (item.at("status") == "in_progress" ||
+ // item.at("status") == "completed" ||
+ // item.at("status") == "incomplete") &&
+ // item["status"] not sent by codex-cli
+ exists_and_is_string(item, "type") &&
+ item.at("type") == "message"
+ ) {
+ // #responses_create-input-input_item_list-item-output_message
+ std::vector<json> chatcmpl_content;
+
+ for (const auto & output_text : item.at("content")) {
+ const std::string type = json_value(output_text, "type", std::string());
+ if (type != "output_text") {
+ throw std::invalid_argument("'type' must be 'output_text'");
+ }
+ if (!exists_and_is_string(output_text, "text")) {
+ throw std::invalid_argument("'Output text' requires 'text'");
+ }
+ // Ignore annotations and logprobs for now
+ chatcmpl_content.push_back({
+ {"text", output_text.at("text")},
+ {"type", "text"},
+ });
+ }
+
+ item.erase("status");
+ item.erase("type");
+ item["content"] = chatcmpl_content;
+ chatcmpl_messages.push_back(item);
+ } else if (exists_and_is_string(item, "arguments") &&
+ exists_and_is_string(item, "call_id") &&
+ exists_and_is_string(item, "name") &&
+ exists_and_is_string(item, "type") &&
+ item.at("type") == "function_call"
+ ) {
+ // #responses_create-input-input_item_list-item-function_tool_call
+ json msg = json {
+ {"role", "assistant"},
+ {"tool_calls", json::array({ json {
+ {"function", json {
+ {"arguments", item.at("arguments")},
+ {"name", item.at("name")},
+ }},
+ {"id", item.at("call_id")},
+ {"type", "function"},
+ }})},
+ };
+
+ if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
+ // Move reasoning content from dummy message to tool call message
+ msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
+ chatcmpl_messages.pop_back();
+ }
+ chatcmpl_messages.push_back(msg);
+ } else if (exists_and_is_string(item, "call_id") &&
+ (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
+ exists_and_is_string(item, "type") &&
+ item.at("type") == "function_call_output"
+ ) {
+ // #responses_create-input-input_item_list-item-function_tool_call_output
+ if (item.at("output").is_string()) {
+ chatcmpl_messages.push_back(json {
+ {"content", item.at("output")},
+ {"role", "tool"},
+ {"tool_call_id", item.at("call_id")},
+ });
+ } else {
+ json chatcmpl_outputs = item.at("output");
+ for (json & chatcmpl_output : chatcmpl_outputs) {
+ if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
+ throw std::invalid_argument("Output of tool call should be 'Input text'");
+ }
+ chatcmpl_output["type"] = "text";
+ }
+ chatcmpl_messages.push_back(json {
+ {"content", chatcmpl_outputs},
+ {"role", "tool"},
+ {"tool_call_id", item.at("call_id")},
+ });
+ }
+ } else if (// exists_and_is_string(item, "id") &&
+ // item["id"] not sent by codex-cli
+ exists_and_is_array(item, "summary") &&
+ exists_and_is_string(item, "type") &&
+ item.at("type") == "reasoning") {
+ // #responses_create-input-input_item_list-item-reasoning
+
+ if (!exists_and_is_array(item, "content")) {
+ throw std::invalid_argument("item['content'] is not an array");
+ }
+ if (item.at("content").empty()) {
+ throw std::invalid_argument("item['content'] is empty");
+ }
+ if (!exists_and_is_string(item.at("content")[0], "text")) {
+ throw std::invalid_argument("item['content']['text'] is not a string");
+ }
+
+ // Pack reasoning content in dummy message
+ chatcmpl_messages.push_back(json {
+ {"role", "assistant"},
+ {"content", json::array()},
+ {"reasoning_content", item.at("content")[0].at("text")},
+ });
+ } else {
+ throw std::invalid_argument("Cannot determine type of 'item'");
+ }
+ }
+ } else {
+ throw std::invalid_argument("'input' must be a string or array of objects");
+ }
+
+ // Remove unused dummy messages that contain
+ // reasoning content not followed by a tool call
+ chatcmpl_messages.erase(std::remove_if(
+ chatcmpl_messages.begin(),
+ chatcmpl_messages.end(),
+ [](const json & x){ return x.contains("role") &&
+ x.at("role") == "assistant" &&
+ x.contains("content") &&
+ x.at("content") == json::array() &&
+ x.contains("reasoning_content");
+ }),
+ chatcmpl_messages.end()
+ );
+
+ chatcmpl_body["messages"] = chatcmpl_messages;
+
+ if (response_body.contains("tools")) {
+ if (!response_body.at("tools").is_array()) {
+ throw std::invalid_argument("'tools' must be an array of objects");
+ }
+ std::vector<json> chatcmpl_tools;
+ for (json resp_tool : response_body.at("tools")) {
+ json chatcmpl_tool;
+
+ if (json_value(resp_tool, "type", std::string()) != "function") {
+ throw std::invalid_argument("'type' of tool must be 'function'");
+ }
+ resp_tool.erase("type");
+ chatcmpl_tool["type"] = "function";
+
+ if (!resp_tool.contains("strict")) {
+ resp_tool["strict"] = true;
+ }
+ chatcmpl_tool["function"] = resp_tool;
+ chatcmpl_tools.push_back(chatcmpl_tool);
+ }
+ chatcmpl_body.erase("tools");
+ chatcmpl_body["tools"] = chatcmpl_tools;
+ }
+
+ if (response_body.contains("max_output_tokens")) {
+ chatcmpl_body.erase("max_output_tokens");
+ chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
+ }
+
+ return chatcmpl_body;
+}
+
json convert_anthropic_to_oai(const json & body) {
json oai_body;
return ss.str();
}
+std::string format_oai_resp_sse(const json & data) {
+ std::ostringstream ss;
+ auto send_single = [&ss](const json & event_obj) {
+ ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+ ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+ };
+
+ if (data.is_array()) {
+ for (const auto & item : data) {
+ send_single(item);
+ }
+ } else {
+ send_single(data);
+ }
+
+ return ss.str();
+}
+
std::string format_anthropic_sse(const json & data) {
std::ostringstream ss;
const server_chat_params & opt,
std::vector<raw_buffer> & out_files);
+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);
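+// format OpenAI Responses-style SSE with event types; if data is a json array, each item is sent as a separate event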
+std::string format_oai_resp_sse(const json & data);
+
// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);
json first_result_json = first_result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
res->data = format_anthropic_sse(first_result_json);
+ } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+ res->data = format_oai_resp_sse(first_result_json);
} else {
res->data = format_oai_sse(first_result_json);
}
// check if there is more data
if (!rd.has_next()) {
- if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
- // Anthropic doesn't send [DONE], message_stop was already sent
- output = "";
- } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
- output = "data: [DONE]\n\n";
- } else {
- output = "";
+ switch (res_type) {
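+ // Anthropic and Responses streams already emit their own terminal event (message_stop / response.completed); only OAI chat/completion streams send [DONE]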
+ case TASK_RESPONSE_TYPE_NONE:
+ case TASK_RESPONSE_TYPE_OAI_RESP:
+ case TASK_RESPONSE_TYPE_ANTHROPIC:
+ output = "";
+ break;
+
+ default:
+ output = "data: [DONE]\n\n";
+ break;
}
SRV_DBG("%s", "all results received, terminating stream\n");
return false; // no more data, terminate
json res_json = result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
output = format_anthropic_sse(res_json);
+ } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+ output = format_oai_resp_sse(res_json);
} else {
output = format_oai_sse(res_json);
}
TASK_RESPONSE_TYPE_OAI_CHAT);
};
+ this->post_responses_oai = [this](const server_http_req & req) {
+ auto res = create_response();
+ std::vector<raw_buffer> files;
+ json body = convert_responses_to_chatcmpl(json::parse(req.body));
+ json body_parsed = oaicompat_chat_params_parse(
+ body,
+ meta->chat_params,
+ files);
+ return handle_completions_impl(
+ req,
+ SERVER_TASK_TYPE_COMPLETION,
+ body_parsed,
+ files,
+ TASK_RESPONSE_TYPE_OAI_RESP);
+ };
+
this->post_anthropic_messages = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;
server_http_context::handler_t post_completions;
server_http_context::handler_t post_completions_oai;
server_http_context::handler_t post_chat_completions;
+ server_http_context::handler_t post_responses_oai;
server_http_context::handler_t post_anthropic_messages;
server_http_context::handler_t post_anthropic_count_tokens;
server_http_context::handler_t post_apply_template;
return to_json_oaicompat();
case TASK_RESPONSE_TYPE_OAI_CHAT:
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+ case TASK_RESPONSE_TYPE_OAI_RESP:
+ return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
case TASK_RESPONSE_TYPE_ANTHROPIC:
return stream ? to_json_anthropic_stream() : to_json_anthropic();
default:
return deltas;
}
+json server_task_result_cmpl_final::to_json_oaicompat_resp() {
+ common_chat_msg msg;
+ if (!oaicompat_msg.empty()) {
+ msg = oaicompat_msg;
+ } else {
+ msg.role = "assistant";
+ msg.content = content;
+ }
+
+ std::vector<json> output;
+
+ if (msg.reasoning_content != "") {
+ output.push_back(json {
+ {"id", "rs_" + random_string()},
+ {"summary", json::array()},
+ {"type", "reasoning"},
+ {"content", json::array({ json {
+ {"text", msg.reasoning_content},
+ {"type", "reasoning_text"},
+ }})},
+ {"encrypted_content", ""},
+ {"status", "completed"},
+ });
+ }
+
+ if (msg.content != "") {
+ output.push_back(json {
+ {"content", json::array({ json {
+ {"type", "output_text"},
+ {"annotations", json::array()},
+ {"logprobs", json::array()},
+ {"text", msg.content},
+ }})},
+ {"id", "msg_" + random_string()},
+ {"role", msg.role},
+ {"status", "completed"},
+ {"type", "message"},
+ });
+ }
+
+ for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
+ output.push_back(json {
+ {"type", "function_call"},
+ {"status", "completed"},
+ {"arguments", tool_call.arguments},
+ {"call_id", "fc_" + tool_call.id},
+ {"name", tool_call.name},
+ });
+ }
+
+ std::time_t t = std::time(0);
+ json res = {
+ {"completed_at", t},
+ {"created_at", t},
+ {"id", oai_resp_id},
+ {"model", oaicompat_model},
+ {"object", "response"},
+ {"output", output},
+ {"status", "completed"},
+ {"usage", json {
+ {"input_tokens", n_prompt_tokens},
+ {"output_tokens", n_decoded},
+ {"total_tokens", n_decoded + n_prompt_tokens},
+ }},
+ };
+
+ return res;
+}
+
+json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
+ std::vector<json> server_sent_events;
+ std::vector<json> output;
+
+ if (oaicompat_msg.reasoning_content != "") {
+ const json output_item = json {
+ {"id", oai_resp_reasoning_id},
+ {"summary", json::array()},
+ {"type", "reasoning"},
+ {"content", json::array({ json {
+ {"text", oaicompat_msg.reasoning_content},
+ {"type", "reasoning_text"},
+ }})},
+ {"encrypted_content", ""},
+ };
+
+ server_sent_events.push_back(json {
+ {"event", "response.output_item.done"},
+ {"data", json {
+ {"type", "response.output_item.done"},
+ {"item", output_item}
+ }}
+ });
+ output.push_back(output_item);
+ }
+
+ if (oaicompat_msg.content != "") {
+ server_sent_events.push_back(json {
+ {"event", "response.output_text.done"},
+ {"data", json {
+ {"type", "response.output_text.done"},
+ {"item_id", oai_resp_message_id},
+ {"text", oaicompat_msg.content}
+ }}
+ });
+
+ const json content_part = {
+ {"type", "output_text"},
+ {"annotations", json::array()},
+ {"logprobs", json::array()},
+ {"text", oaicompat_msg.content}
+ };
+
+ server_sent_events.push_back(json {
+ {"event", "response.content_part.done"},
+ {"data", json {
+ {"type", "response.content_part.done"},
+ {"item_id", oai_resp_message_id},
+ {"part", content_part}
+ }}
+ });
+ const json output_item = {
+ {"type", "message"},
+ {"status", "completed"},
+ {"id", oai_resp_message_id},
+ {"content", json::array({content_part})},
+ {"role", "assistant"}
+ };
+
+ server_sent_events.push_back(json {
+ {"event", "response.output_item.done"},
+ {"data", json {
+ {"type", "response.output_item.done"},
+ {"item", output_item}
+ }}
+ });
+ output.push_back(output_item);
+ }
+
+ for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
+ const json output_item = {
+ {"type", "function_call"},
+ {"status", "completed"},
+ {"arguments", tool_call.arguments},
+ {"call_id", "fc_" + tool_call.id},
+ {"name", tool_call.name}
+ };
+ server_sent_events.push_back(json {
+ {"event", "response.output_item.done"},
+ {"data", json {
+ {"type", "response.output_item.done"},
+ {"item", output_item}
+ }}
+ });
+ output.push_back(output_item);
+ }
+
+ std::time_t t = std::time(0);
+ server_sent_events.push_back(json {
+ {"event", "response.completed"},
+ {"data", json {
+ {"type", "response.completed"},
+ {"response", json {
+ {"id", oai_resp_id},
+ {"object", "response"},
+ {"created_at", t},
+ {"status", "completed"},
+ {"model", oaicompat_model},
+ {"output", output},
+ {"usage", json {
+ {"input_tokens", n_prompt_tokens},
+ {"output_tokens", n_decoded},
+ {"total_tokens", n_decoded + n_prompt_tokens}
+ }}
+ }},
+ }}
+ });
+
+ return server_sent_events;
+}
+
json server_task_result_cmpl_final::to_json_anthropic() {
std::string stop_reason = "max_tokens";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
//
// server_task_result_cmpl_partial
//
+void server_task_result_cmpl_partial::update(task_result_state & state) {
+ is_updated = true;
+ state.update_chat_msg(content, true, oaicompat_msg_diffs);
+
+ // Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
+ thinking_block_started = state.thinking_block_started;
+ text_block_started = state.text_block_started;
+
+ oai_resp_id = state.oai_resp_id;
+ oai_resp_reasoning_id = state.oai_resp_reasoning_id;
+ oai_resp_message_id = state.oai_resp_message_id;
+ oai_resp_fc_id = state.oai_resp_fc_id;
+
+ // track if the accumulated message has any reasoning content
+ anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
+
+ // Pre-compute state updates based on diffs (for next chunk)
+ for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
+ if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
+ state.thinking_block_started = true;
+ }
+ if (!diff.content_delta.empty() && !state.text_block_started) {
+ state.text_block_started = true;
+ }
+ if (!diff.tool_call_delta.name.empty()) {
+ state.oai_resp_fc_id = diff.tool_call_delta.id;
+ }
+ }
+}
+
json server_task_result_cmpl_partial::to_json() {
GGML_ASSERT(is_updated && "update() must be called before to_json()");
switch (res_type) {
return to_json_oaicompat();
case TASK_RESPONSE_TYPE_OAI_CHAT:
return to_json_oaicompat_chat();
+ case TASK_RESPONSE_TYPE_OAI_RESP:
+ return to_json_oaicompat_resp();
case TASK_RESPONSE_TYPE_ANTHROPIC:
return to_json_anthropic();
default:
return deltas;
}
+json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
+ std::vector<json> events;
+
+ if (n_decoded == 1) {
+ events.push_back(json {
+ {"event", "response.created"},
+ {"data", json {
+ {"type", "response.created"},
+ {"response", json {
+ {"id", oai_resp_id},
+ {"object", "response"},
+ {"status", "in_progress"},
+ }},
+ }},
+ });
+ events.push_back(json {
+ {"event", "response.in_progress"},
+ {"data", json {
+ {"type", "response.in_progress"},
+ {"response", json {
+ {"id", oai_resp_id},
+ {"object", "response"},
+ {"status", "in_progress"},
+ }},
+ }},
+ });
+ }
+
+ for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
+ if (!diff.reasoning_content_delta.empty()) {
+ if (!thinking_block_started) {
+ events.push_back(json {
+ {"event", "response.output_item.added"},
+ {"data", json {
+ {"type", "response.output_item.added"},
+ {"item", json {
+ {"id", oai_resp_reasoning_id},
+ {"summary", json::array()},
+ {"type", "reasoning"},
+ {"content", json::array()},
+ {"encrypted_content", ""},
+ {"status", "in_progress"},
+ }},
+ }},
+ });
+ thinking_block_started = true;
+ }
+ events.push_back(json {
+ {"event", "response.reasoning_text.delta"},
+ {"data", json {
+ {"type", "response.reasoning_text.delta"},
+ {"delta", diff.reasoning_content_delta},
+ {"item_id", oai_resp_reasoning_id},
+ }},
+ });
+ }
+
+ if (!diff.content_delta.empty()) {
+ if (!text_block_started) {
+ events.push_back(json {
+ {"event", "response.output_item.added"},
+ {"data", json {
+ {"type", "response.output_item.added"},
+ {"item", json {
+ {"content", json::array()},
+ {"id", oai_resp_message_id},
+ {"role", "assistant"},
+ {"status", "in_progress"},
+ {"type", "message"},
+ }},
+ }},
+ });
+ events.push_back(json {
+ {"event", "response.content_part.added"},
+ {"data", json {
+ {"type", "response.content_part.added"},
+ {"item_id", oai_resp_message_id},
+ {"part", json {
+ {"type", "output_text"},
+ {"text", ""},
+ }},
+ }},
+ });
+ text_block_started = true;
+ }
+ events.push_back(json {
+ {"event", "response.output_text.delta"},
+ {"data", json {
+ {"type", "response.output_text.delta"},
+ {"item_id", oai_resp_message_id},
+ {"delta", diff.content_delta},
+ }},
+ });
+ }
+
+ if (!diff.tool_call_delta.name.empty()) {
+ events.push_back(json {
+ {"event", "response.output_item.added"},
+ {"data", json {
+ {"type", "response.output_item.added"},
+ {"item", json {
+ {"arguments", ""},
+ {"call_id", "fc_" + diff.tool_call_delta.id},
+ {"name", diff.tool_call_delta.name},
+ {"type", "function_call"},
+ {"status", "in_progress"},
+ }},
+ }},
+ });
+ oai_resp_fc_id = diff.tool_call_delta.id;
+ }
+
+ if (!diff.tool_call_delta.arguments.empty()) {
+ events.push_back(json {
+ {"event", "response.function_call_arguments.delta"},
+ {"data", json {
+ {"type", "response.function_call_arguments.delta"},
+ {"delta", diff.tool_call_delta.arguments},
+ {"item_id", "fc_" + oai_resp_fc_id},
+ }},
+ });
+ }
+ }
+ return events;
+}
+
//
// server_task_result_embd
//
// use local copies of streaming state (copied from task_result_state in update())
// these reflect the state BEFORE this chunk was processed
- bool thinking_started = anthropic_thinking_block_started;
- bool text_started = anthropic_text_block_started;
+ bool thinking_started = thinking_block_started;
+ bool text_started = text_block_started;
for (const auto & diff : oaicompat_msg_diffs) {
// handle thinking/reasoning content
TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
TASK_RESPONSE_TYPE_OAI_CHAT,
TASK_RESPONSE_TYPE_OAI_CMPL,
+ TASK_RESPONSE_TYPE_OAI_RESP,
TASK_RESPONSE_TYPE_OAI_EMBD,
TASK_RESPONSE_TYPE_ANTHROPIC,
};
std::string generated_text; // append new chunks of generated text here
std::vector<std::string> generated_tool_call_ids;
- // for Anthropic API streaming: track content block state across chunks
- bool anthropic_thinking_block_started = false;
- bool anthropic_text_block_started = false;
+ // for OpenAI Responses and Anthropic streaming API:
+ // track output item / content block state across chunks
+ bool thinking_block_started = false;
+ bool text_block_started = false;
+
+ // for OpenAI Responses streaming API
+ const std::string oai_resp_id;
+ const std::string oai_resp_reasoning_id;
+ const std::string oai_resp_message_id;
+ std::string oai_resp_fc_id; // function call ID for current args delta
task_result_state(const common_chat_parser_params & chat_parser_params)
- : chat_parser_params(chat_parser_params) {}
+ : chat_parser_params(chat_parser_params)
+ , oai_resp_id("resp_" + random_string())
+ , oai_resp_reasoning_id("rs_" + random_string())
+ , oai_resp_message_id("msg_" + random_string()) {}
// parse partial tool calls and update the internal state
common_chat_msg update_chat_msg(
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
+ // for OpenAI Responses API
+ std::string oai_resp_id;
+ std::string oai_resp_reasoning_id;
+ std::string oai_resp_message_id;
+
virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop
}
virtual void update(task_result_state & state) override {
is_updated = true;
oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);
+
+ oai_resp_id = state.oai_resp_id;
+ oai_resp_reasoning_id = state.oai_resp_reasoning_id;
+ oai_resp_message_id = state.oai_resp_message_id;
}
json to_json_non_oaicompat();
json to_json_oaicompat_chat_stream();
+ json to_json_oaicompat_resp();
+
+ json to_json_oaicompat_resp_stream();
+
json to_json_anthropic();
json to_json_anthropic_stream();
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
+ // Streaming state copied from task_result_state for this chunk
+ bool thinking_block_started = false;
+ bool text_block_started = false;
+
+ // for OpenAI Responses API
+ std::string oai_resp_id;
+ std::string oai_resp_reasoning_id;
+ std::string oai_resp_message_id;
+ std::string oai_resp_fc_id;
+
// for Anthropic API: track if any reasoning content has been generated
bool anthropic_has_reasoning = false;
- // Streaming state copied from task_result_state for this chunk
- bool anthropic_thinking_block_started = false;
- bool anthropic_text_block_started = false;
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
- virtual json to_json() override;
+ virtual void update(task_result_state & state) override;
- virtual void update(task_result_state & state) override {
- is_updated = true;
- state.update_chat_msg(content, true, oaicompat_msg_diffs);
- // track if the accumulated message has any reasoning content
- anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
-
- // Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
- anthropic_thinking_block_started = state.anthropic_thinking_block_started;
- anthropic_text_block_started = state.anthropic_text_block_started;
-
- // Pre-compute state updates based on diffs (for next chunk)
- for (const auto & diff : oaicompat_msg_diffs) {
- if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
- state.anthropic_thinking_block_started = true;
- }
- if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
- state.anthropic_text_block_started = true;
- }
- }
- }
+ virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat_chat();
+ json to_json_oaicompat_resp();
+
json to_json_anthropic();
};
routes.post_completions = models_routes->proxy_post;
routes.post_completions_oai = models_routes->proxy_post;
routes.post_chat_completions = models_routes->proxy_post;
+ routes.post_responses_oai = models_routes->proxy_post;
routes.post_anthropic_messages = models_routes->proxy_post;
routes.post_anthropic_count_tokens = models_routes->proxy_post;
routes.post_infill = models_routes->proxy_post;
ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+ ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
ctx_http.post("/infill", ex_wrapper(routes.post_infill));
pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
prometheus-client~=0.20.0
requests~=2.32.3
wget~=3.2
--- /dev/null
+import pytest
+from openai import OpenAI
+from utils import *
+
+server: ServerProcess
+
+@pytest.fixture(autouse=True)
+def create_server():
+ global server
+ server = ServerPreset.tinyllama2()
+
+def test_responses_with_openai_library():
+ global server
+ server.start()
+ client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+ res = client.responses.create(
+ model="gpt-4.1",
+ input=[
+ {"role": "system", "content": "Book"},
+ {"role": "user", "content": "What is the best book"},
+ ],
+ max_output_tokens=8,
+ temperature=0.8,
+ )
+ assert res.id.startswith("resp_")
+ assert res.output[0].id is not None
+ assert res.output[0].id.startswith("msg_")
+ assert match_regex("(Suddenly)+", res.output_text)
+
+def test_responses_stream_with_openai_library():
+ global server
+ server.start()
+ client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+ stream = client.responses.create(
+ model="gpt-4.1",
+ input=[
+ {"role": "system", "content": "Book"},
+ {"role": "user", "content": "What is the best book"},
+ ],
+ max_output_tokens=8,
+ temperature=0.8,
+ stream=True,
+ )
+
+ gathered_text = ''
+ resp_id = ''
+ msg_id = ''
+ for r in stream:
+ if r.type == "response.created":
+ assert r.response.id.startswith("resp_")
+ resp_id = r.response.id
+ if r.type == "response.in_progress":
+ assert r.response.id == resp_id
+ if r.type == "response.output_item.added":
+ assert r.item.id is not None
+ assert r.item.id.startswith("msg_")
+ msg_id = r.item.id
+ if (r.type == "response.content_part.added" or
+ r.type == "response.output_text.delta" or
+ r.type == "response.output_text.done" or
+ r.type == "response.content_part.done"):
+ assert r.item_id == msg_id
+ if r.type == "response.output_item.done":
+ assert r.item.id == msg_id
+
+ if r.type == "response.output_text.delta":
+ gathered_text += r.delta
+ if r.type == "response.completed":
+ assert r.response.id.startswith("resp_")
+ assert r.response.output[0].id is not None
+ assert r.response.output[0].id.startswith("msg_")
+ assert gathered_text == r.response.output_text
+ assert match_regex("(Suddenly)+", r.response.output_text)