// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
+// MIME type for sending JSON responses
+#define MIMETYPE_JSON "application/json; charset=utf-8"
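+// (one shared constant so every endpoint sends the same content type and charset)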
// auto generated files (update with ./deps.sh)
#include "colorthemes.css.hpp"
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
- SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum server_task_type {
add_bos_token = llama_add_bos_token(model);
has_eos_token = !llama_add_eos_token(model);
+
return true;
}
svr->set_default_headers({{"Server", "llama.cpp"}});
// CORS preflight
- svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
+ // Access-Control-Allow-Origin is already set by middleware
res.set_header("Access-Control-Allow-Credentials", "true");
res.set_header("Access-Control-Allow-Methods", "POST");
res.set_header("Access-Control-Allow-Headers", "*");
- return res.set_content("", "application/json; charset=utf-8");
+ return res.set_content("", "text/html"); // blank response, no data
});
svr->set_logger(log_server_request);
auto res_error = [](httplib::Response & res, json error_data) {
json final_response {{"error", error_data}};
- res.set_content(final_response.dump(), "application/json; charset=utf-8");
+ res.set_content(final_response.dump(), MIMETYPE_JSON);
res.status = json_value(error_data, "code", 500);
};
svr->set_read_timeout (params.timeout_read);
svr->set_write_timeout(params.timeout_write);
- if (!svr->bind_to_port(params.hostname, params.port)) {
- fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
- return 1;
- }
-
std::unordered_map<std::string, std::string> log_data;
log_data["hostname"] = params.hostname;
// Necessary similarity of prompt for slot selection
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
- // load the model
- if (!ctx_server.load_model(params)) {
- state.store(SERVER_STATE_ERROR);
- return 1;
- } else {
- ctx_server.init();
- state.store(SERVER_STATE_READY);
- }
-
- LOG_INFO("model loaded", {});
-
- const auto model_meta = ctx_server.model_meta();
-
- // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (params.chat_template.empty()) {
- if (!ctx_server.validate_model_chat_template()) {
- LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- params.chat_template = "chatml";
- }
- }
-
- // print sample chat example to make it clear which template is used
- {
- LOG_INFO("chat template", {
- {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
- {"built_in", params.chat_template.empty()},
- });
- }
-
//
// Middlewares
//
}
// API key is invalid or not provided
- // TODO: make another middleware for CORS related logic
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
LOG_WARNING("Unauthorized: Invalid API Key", {});
return false;
};
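+ // while the model is still loading, short-circuit every request with the usual error
+ // envelope, roughly: {"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}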
+ auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
+ server_state current_state = state.load();
+ if (current_state == SERVER_STATE_LOADING_MODEL) {
+ res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+ return false;
+ }
+ return true;
+ };
+
// register server middlewares
- svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) {
+ svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ if (!middleware_server_state(req, res)) {
+ return httplib::Server::HandlerResponse::Handled;
+ }
if (!middleware_validate_api_key(req, res)) {
return httplib::Server::HandlerResponse::Handled;
}
// Route handlers (or controllers)
//
- const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
- server_state current_state = state.load();
- switch (current_state) {
- case SERVER_STATE_READY:
- {
- // request slots data using task queue
- server_task task;
- task.id = ctx_server.queue_tasks.get_new_id();
- task.type = SERVER_TASK_TYPE_METRICS;
- task.id_target = -1;
-
- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task);
-
- // get the result
- server_task_result result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
-
- const int n_idle_slots = result.data.at("idle");
- const int n_processing_slots = result.data.at("processing");
-
- json health = {
- {"status", "ok"},
- {"slots_idle", n_idle_slots},
- {"slots_processing", n_processing_slots}
- };
-
- res.status = 200; // HTTP OK
- if (params.endpoint_slots && req.has_param("include_slots")) {
- health["slots"] = result.data.at("slots");
- }
-
- if (n_idle_slots == 0) {
- health["status"] = "no slot available";
- if (req.has_param("fail_on_no_slot")) {
- res.status = 503; // HTTP Service Unavailable
- }
- }
-
- res.set_content(health.dump(), "application/json");
- break;
- }
- case SERVER_STATE_LOADING_MODEL:
- {
- res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
- } break;
- case SERVER_STATE_ERROR:
- {
- res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
- } break;
- }
+ const auto handle_health = [&](const httplib::Request &, httplib::Response & res) {
+ // error and loading states are handled by middleware
+ json health = {{"status", "ok"}};
+ res.set_content(health.dump(), MIMETYPE_JSON);
};
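+ // e.g. `GET /health` now simply answers 200 {"status":"ok"}; the per-slot data that
+ // used to be reported here moved to the /slots endpoint below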
- const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
+ const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
if (!params.endpoint_slots) {
- res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
+ res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
- res.set_content(result.data.at("slots").dump(), "application/json");
+ // optionally return "fail_on_no_slot" error
+ const int n_idle_slots = result.data.at("idle");
+ if (req.has_param("fail_on_no_slot")) {
+ if (n_idle_slots == 0) {
+ res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE));
+ return;
+ }
+ }
+
+ res.set_content(result.data.at("slots").dump(), MIMETYPE_JSON);
res.status = 200; // HTTP OK
};
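+ // e.g. `GET /slots?fail_on_no_slot=1` turns "every slot busy" into a 503 error;
+ // without the parameter the endpoint always returns the per-slot JSON list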
const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
if (!params.endpoint_metrics) {
- res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
+ res_error(res, format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
if (result.error) {
res_error(res, result.data);
} else {
- res.set_content(result.data.dump(), "application/json");
+ res.set_content(result.data.dump(), MIMETYPE_JSON);
}
};
if (result.error) {
res_error(res, result.data);
} else {
- res.set_content(result.data.dump(), "application/json");
+ res.set_content(result.data.dump(), MIMETYPE_JSON);
}
};
if (result.error) {
res_error(res, result.data);
} else {
- res.set_content(result.data.dump(), "application/json");
+ res.set_content(result.data.dump(), MIMETYPE_JSON);
}
};
const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
std::string id_slot_str = req.path_params.at("id_slot");
int id_slot;
}
};
- const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_props = [&ctx_server](const httplib::Request &, httplib::Response & res) {
std::string template_key = "tokenizer.chat_template", curr_tmpl;
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
if (tlen > 0) {
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
}
}
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
{ "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "chat_template", curr_tmpl.c_str() }
};
- res.set_content(data.dump(), "application/json; charset=utf-8");
+ res.set_content(data.dump(), MIMETYPE_JSON);
};
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
return;
}
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
json data = json::parse(req.body);
const int id_task = ctx_server.queue_tasks.get_new_id();
if (!json_value(data, "stream", false)) {
server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error && result.stop) {
- res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
} else {
res_error(res, result.data);
}
}
};
- const auto handle_models = [&params, &model_meta](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
+ const auto handle_models = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) {
json models = {
{"object", "list"},
{"data", {
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
- {"meta", model_meta}
+ {"meta", ctx_server.model_meta()}
},
}}
};
- res.set_content(models.dump(), "application/json; charset=utf-8");
+ res.set_content(models.dump(), MIMETYPE_JSON);
};
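+ // note: `meta` is now read via ctx_server.model_meta() at request time instead of a
+ // value captured at startup, since routes are registered before the model is loaded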
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
-
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
const int id_task = ctx_server.queue_tasks.get_new_id();
if (!result.error && result.stop) {
json result_oai = format_final_response_oaicompat(data, result.data, completion_id);
- res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
} else {
res_error(res, result.data);
}
return;
}
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
json data = json::parse(req.body);
const int id_task = ctx_server.queue_tasks.get_new_id();
if (!json_value(data, "stream", false)) {
server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error && result.stop) {
- res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
} else {
res_error(res, result.data);
}
};
const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::vector<llama_token> tokens;
tokens = ctx_server.tokenize(body.at("content"), add_special);
}
const json data = format_tokenizer_response(tokens);
- return res.set_content(data.dump(), "application/json; charset=utf-8");
+ return res.set_content(data.dump(), MIMETYPE_JSON);
};
const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::string content;
}
const json data = format_detokenized_response(content);
- return res.set_content(data.dump(), "application/json; charset=utf-8");
+ return res.set_content(data.dump(), MIMETYPE_JSON);
};
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
const json body = json::parse(req.body);
bool is_openai = false;
json root = is_openai
? format_embeddings_response_oaicompat(body, responses)
: responses[0];
- return res.set_content(root.dump(), "application/json; charset=utf-8");
+ return res.set_content(root.dump(), MIMETYPE_JSON);
};
- const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
json result = json::array();
for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
auto & la = ctx_server.lora_adapters[i];
{"scale", la.scale},
});
}
- res.set_content(result.dump(), "application/json");
+ res.set_content(result.dump(), MIMETYPE_JSON);
res.status = 200; // HTTP OK
};
const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
const std::vector<json> body = json::parse(req.body);
int max_idx = ctx_server.lora_adapters.size();
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
- res.set_content(result.data.dump(), "application/json");
+ res.set_content(result.data.dump(), MIMETYPE_JSON);
res.status = 200; // HTTP OK
};
log_data["n_threads_http"] = std::to_string(params.n_threads_http);
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
- LOG_INFO("HTTP server listening", log_data);
+// clean-up function, to be called before exit
+ auto clean_up = [&svr]() {
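+ // order matters: stop the HTTP server before freeing the backend it may still be using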
+ svr->stop();
+ llama_backend_free();
+ };
- // run the HTTP server in a thread - see comment below
- std::thread t([&]() {
- if (!svr->listen_after_bind()) {
- state.store(SERVER_STATE_ERROR);
- return 1;
+ // bind HTTP listen port, run the HTTP server in a thread
+ if (!svr->bind_to_port(params.hostname, params.port)) {
+ LOG_ERROR("couldn't bind HTTP server socket", {
+ {"hostname", params.hostname},
+ {"port", params.port},
+ });
+ clean_up();
+ LOG_ERROR("exiting due to HTTP server error", {});
+ return 1;
+ }
+ std::thread t([&]() { svr->listen_after_bind(); });
+ svr->wait_until_ready();
+
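+ // from this point on the socket accepts connections; until load_model() below
+ // finishes, middleware_server_state answers every request with 503 "Loading model"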
+ LOG_INFO("HTTP server is listening", log_data);
+
+ // load the model
+ LOG_INFO("loading model", log_data);
+ if (!ctx_server.load_model(params)) {
+ clean_up();
+ t.join();
+ LOG_ERROR("exiting due to model loading error", {});
+ return 1;
+ } else {
+ ctx_server.init();
+ state.store(SERVER_STATE_READY);
+
+ LOG_INFO("model loaded", {});
+
+ // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
+ if (params.chat_template.empty()) {
+ if (!ctx_server.validate_model_chat_template()) {
+ LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+ params.chat_template = "chatml";
+ }
}
- return 0;
- });
+ // print sample chat example to make it clear which template is used
+ {
+ LOG_INFO("chat template", {
+ {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
+ {"built_in", params.chat_template.empty()},
+ });
+ }
- ctx_server.queue_tasks.on_new_task(std::bind(
- &server_context::process_single_task, &ctx_server, std::placeholders::_1));
- ctx_server.queue_tasks.on_finish_multitask(std::bind(
- &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1));
- ctx_server.queue_tasks.on_update_slots(std::bind(
- &server_context::update_slots, &ctx_server));
- ctx_server.queue_results.on_multitask_update(std::bind(
- &server_queue::update_multitask,
- &ctx_server.queue_tasks,
- std::placeholders::_1,
- std::placeholders::_2,
- std::placeholders::_3
- ));
-
- shutdown_handler = [&](int) {
- ctx_server.queue_tasks.terminate();
- };
+ ctx_server.queue_tasks.on_new_task(std::bind(
+ &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_finish_multitask(std::bind(
+ &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_update_slots(std::bind(
+ &server_context::update_slots, &ctx_server));
+ ctx_server.queue_results.on_multitask_update(std::bind(
+ &server_queue::update_multitask,
+ &ctx_server.queue_tasks,
+ std::placeholders::_1,
+ std::placeholders::_2,
+ std::placeholders::_3
+ ));
+
+ shutdown_handler = [&](int) {
+ ctx_server.queue_tasks.terminate();
+ };
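+ // shutdown_handler terminates the task queue, which makes the blocking start_loop()
+ // below return so execution can reach clean_up()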
+ ctx_server.queue_tasks.start_loop();
+ }
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- ctx_server.queue_tasks.start_loop();
-
- svr->stop();
+ clean_up();
t.join();
- llama_backend_free();
-
return 0;
}
async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
match expecting_status:
case 'healthy':
- await wait_for_health_status(context, context.base_url, 200, 'ok',
- timeout=30)
+ await wait_for_slots_status(context, context.base_url, 200,
+ timeout=30)
case 'ready' | 'idle':
- await wait_for_health_status(context, context.base_url, 200, 'ok',
- timeout=30,
- params={'fail_on_no_slot': 0, 'include_slots': 0},
- slots_idle=context.n_slots,
- slots_processing=0,
- expected_slots=[{'id': slot_id, 'state': 0}
- for slot_id in
- range(context.n_slots if context.n_slots else 1)])
+ await wait_for_slots_status(context, context.base_url, 200,
+ timeout=30,
+ params={'fail_on_no_slot': 1},
+ slots_idle=context.n_slots,
+ slots_processing=0)
case 'busy':
- await wait_for_health_status(context, context.base_url, 503,
- 'no slot available',
- params={'fail_on_no_slot': 0, 'include_slots': 0},
- slots_idle=0,
- slots_processing=context.n_slots,
- expected_slots=[{'id': slot_id, 'state': 1}
- for slot_id in
- range(context.n_slots if context.n_slots else 1)])
+ await wait_for_slots_status(context, context.base_url, 503,
+ params={'fail_on_no_slot': 1},
+ slots_idle=0,
+ slots_processing=context.n_slots)
case _:
assert False, "unknown status"
return n_completions
-async def wait_for_health_status(context,
- base_url,
- expected_http_status_code,
- expected_health_status,
- timeout=3,
- params=None,
- slots_idle=None,
- slots_processing=None,
- expected_slots=None):
+async def wait_for_slots_status(context,
+ base_url,
+ expected_http_status_code,
+ timeout=3,
+ params=None,
+ slots_idle=None,
+ slots_processing=None):
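+ # polls GET /slots until the HTTP status code matches expected_http_status_code and,
+ # when provided, the idle/processing counts match (a slot with "state" == 0 is idle)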
if context.debug:
- print(f"Starting checking for health for expected_health_status={expected_health_status}")
+ print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}")
interval = 0.5
counter = 0
if 'GITHUB_ACTIONS' in os.environ:
async with aiohttp.ClientSession() as session:
while True:
- async with await session.get(f'{base_url}/health', params=params) as health_response:
- status_code = health_response.status
- health = await health_response.json()
+ async with await session.get(f'{base_url}/slots', params=params) as slots_response:
+ status_code = slots_response.status
+ slots = await slots_response.json()
if context.debug:
- print(f"HEALTH - response for expected health status='{expected_health_status}' on "
- f"'{base_url}/health'?{params} is {health}\n")
- if (status_code == expected_http_status_code
- and health['status'] == expected_health_status
- and (slots_idle is None or health['slots_idle'] == slots_idle)
- and (slots_processing is None or health['slots_processing'] == slots_processing)):
- if expected_slots is not None:
- assert_slots_status(health['slots'], expected_slots)
- return
- if (status_code == expected_http_status_code
- and health['status'] == expected_health_status
- and (slots_idle is None or health['slots_idle'] == slots_idle)
- and (slots_processing is None or health['slots_processing'] == slots_processing)):
- if expected_slots is not None:
- assert_slots_status(health['slots'], expected_slots)
+ print(f"slots responses {slots}\n")
+ if status_code == 503 and status_code == expected_http_status_code:
+ return
+ if status_code == 200 and status_code == expected_http_status_code:
+ n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
+ n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+ if ((slots_idle is None or slots_idle == n_slots_idle)
+ and (slots_processing is None or slots_processing == n_slots_processing)):
+ return
await asyncio.sleep(interval)
counter += interval
if n_completions > 0:
return
- assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
+ assert False, f'slots check timeout exceeded {counter}s>={timeout}'
def assert_embeddings(embeddings):