}
void release() {
- if (state == PROCESSING)
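+ // a slot can now be released while still IDLE (e.g. an empty prompt), not only mid-generation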
+ if (state == IDLE || state == PROCESSING)
{
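+ // record how long token generation ran, converted from microseconds to milliseconds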
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE;
}
slot->params.antiprompt.clear();
+
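+ // the request may pass "stop" as an array of strings that should end generation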
const auto &stop = data.find("stop");
if (stop != data.end() && stop->is_array())
{
kv_cache_clear();
- for (int32_t i = 0; i < batch.n_tokens; ++i)
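+ // batch.n_tokens is stale at this point; iterate over the freshly tokenized system prompt instead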
+ for (int i = 0; i < (int) system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
{
slot.release();
}
- wait_all_are_idle();
- all_slots_are_idle = true;
- // wait until system prompt load
system_need_update = true;
- while (system_need_update)
- {
- std::this_thread::sleep_for(std::chrono::milliseconds(5));
- }
- // system prompt loaded, continue
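+ // the actual update now happens in the main loop once every slot is idle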
}
void process_system_prompt_data(const json &sys_props) {
{
notify_system_prompt_changed();
}
- else
- {
- system_need_update = true;
- }
- }
-
- void wait_all_are_idle() {
- bool wait = true;
- while (wait)
- {
- wait = false;
- for (auto &slot : slots)
- {
- if (!slot.available())
- {
- wait = true;
- break;
- }
- }
- }
}
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
slot.has_next_token = false;
}
stop_pos = pos;
-
}
}
process_tasks();
- // update the system prompt wait until all slots are idle state
+ // apply a pending system prompt update once all slots are idle
- if (system_need_update)
+ if (system_need_update && all_slots_are_idle)
{
LOG_TEE("updating system prompt\n");
update_system_prompt();
for (auto & slot : slots)
{
// release the slot
- if (slot.state == PROCESSING && slot.command == RELEASE)
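+ // a released slot may still be IDLE (e.g. empty prompt), so the command alone decides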
+ if (slot.command == RELEASE)
{
slot.state = IDLE;
slot.command = NONE;
continue;
}
- if (slot.state == IDLE || slot.command == RELEASE)
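+ // RELEASE was handled above, so only genuinely idle slots are skipped here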
+ if (slot.state == IDLE)
{
continue;
}
{
for (auto & slot : slots)
{
+ const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+
+ // empty prompt passed -> release the slot and send an empty response
+ if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+ {
+ slot.release();
+ slot.print_timings();
+ send_final_response(slot);
+ continue;
+ }
+
// need to process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
{
if (!process_token(result, slot))
{
slot.release();
- send_final_response(slot);
slot.print_timings();
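+ // send the response last, mirroring the release/print/send order of the empty prompt path above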
+ send_final_response(slot);
}
slot.i_batch = -1;
if (!json_value(data, "stream", false)) {
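+ // non-streaming path: wait for the single, final result before answering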
std::string completion_text;
task_result result = llama.next_result(task_id);
- if(!result.error && result.stop) {
+ if (!result.error && result.stop) {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
}
else
{
return false;
}
- if(result.stop) {
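+ // the final streamed chunk arrives with result.stop set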
+ if (result.stop) {
break;
}
} else {