server : fix infill when prompt is empty (#4833)

author Georgi Gerganov <redacted>
Thu, 11 Jan 2024 21:23:49 +0000 (23:23 +0200)

committer GitHub <redacted>
Thu, 11 Jan 2024 21:23:49 +0000 (23:23 +0200)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 031824e145411c601a4d25c19a0e0485ab94979a..1d30a15a6cc1e084c7f754645a3407865aee81d7 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1406,7 +1406,7 @@ struct llama_server_context
          task.multitask_id = multitask_id;
  
          // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.at("prompt").size() > 1)
+        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
          {
              lock.unlock(); // entering new func scope
              return split_multiprompt_task(task);
@@ -1577,9 +1577,9 @@ struct llama_server_context
  
                      slot->reset();
  
-                    slot->infill = task.infill_mode;
-                    slot->embedding = task.embedding_mode;
-                    slot->task_id = task.id;
+                    slot->infill       = task.infill_mode;
+                    slot->embedding    = task.embedding_mode;
+                    slot->task_id      = task.id;
                      slot->multitask_id = task.multitask_id;
  
                      if (!launch_slot_with_data(slot, task.data))
@@ -1731,7 +1731,8 @@ struct llama_server_context
                  const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
  
                  // empty prompt passed -> release the slot and send empty response
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+                // note: infill mode allows empty prompt
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
                  {
                      slot.release();
                      slot.print_timings();
@@ -2609,8 +2610,8 @@ static json format_final_response_oaicompat(const json &request, const task_resu
              {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
              {"usage",
                  json{{"completion_tokens", num_tokens_predicted},
-                    {"prompt_tokens", num_prompt_tokens},
-                    {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
+                     {"prompt_tokens",     num_prompt_tokens},
+                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
              {"id", gen_chatcmplid()}};
  
      if (server_verbose) {
author	Georgi Gerganov <redacted>
	Thu, 11 Jan 2024 21:23:49 +0000 (23:23 +0200)
committer	GitHub <redacted>
	Thu, 11 Jan 2024 21:23:49 +0000 (23:23 +0200)