server : improve infill context reuse (#9894)
author Georgi Gerganov <redacted>
Tue, 15 Oct 2024 13:28:55 +0000 (16:28 +0300)
committer GitHub <redacted>
Tue, 15 Oct 2024 13:28:55 +0000 (16:28 +0300)
ggml-ci

examples/server/README.md
examples/server/server.cpp

index eb0a7b32ef8890dc644f75f192fcec21bc2b31f4..fcdb02afd3b93f60b447195cdb3cabcaf38007c3 100644
@@ -524,10 +524,12 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
 
 - `input_prefix`: Set the prefix of the code to infill.
 - `input_suffix`: Set the suffix of the code to infill.
-- `prompt`: Added after the `FIM_MID` token
-- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874
+- `input_extra`:  Additional context inserted before the FIM prefix.
+- `prompt`:       Added after the `FIM_MID` token
 
-It also accepts all the options of `/completion`.
+`input_extra` is array of `{"filename": string, "text": string}` objects.
+
+The endpoint also accepts all the options of `/completion`.
 
 If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
 
@@ -545,7 +547,7 @@ If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](
 If the tokens are missing, then the extra context is simply prefixed at the start:
 
 ```txt
-[extra_context]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+[input_extra]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
 ```
 
 ### **GET** `/props`: Get server global properties.
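
For reference, a minimal sketch of a client-side request body for the fields documented above, built with nlohmann::json (the JSON library the server itself uses); the prefix, suffix, filenames and snippet contents are placeholder values, not part of the commit:

```cpp
// Sketch only: assembles an /infill request body with the documented fields.
// The concrete prefix/suffix/extra values are hypothetical.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::json body = {
        {"input_prefix", "int main() {\n    "},
        {"input_suffix", "\n    return 0;\n}\n"},
        {"prompt", ""},  // appended after the FIM_MID token
        {"input_extra", nlohmann::json::array({
            {{"filename", "utils.h"}, {"text", "int add(int a, int b);\n"}},
        })},
    };

    std::cout << body.dump(2) << std::endl; // POST this JSON to /infill
}
```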
index 8d4380e12f35af78248fbc3147e7345c5d1db261..d53cca84ce362d5c22386f3d13d58b6e1a6c4a51 100644
@@ -136,10 +136,6 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
     std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-    json extra_context;
 };
 
 struct server_slot {
@@ -169,6 +165,10 @@ struct server_slot {
 
     json prompt; // can be either a string, array of strings or array of token ids
 
+    json input_prefix;
+    json input_suffix;
+    json input_extra;
+
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
     std::vector<llama_token> extra_tokens;
@@ -910,12 +910,12 @@ struct server_context {
         }
 
         // infill
-        slot.params.input_prefix  = json_value(data, "input_prefix",  default_params.input_prefix);
-        slot.params.input_suffix  = json_value(data, "input_suffix",  default_params.input_suffix);
-        slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
+        slot.input_prefix = json_value(data, "input_prefix", json());
+        slot.input_suffix = json_value(data, "input_suffix", json());
+        slot.input_extra  = json_value(data, "input_extra",  json());
 
-        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
-        for (const auto & chunk : slot.params.extra_context) {
+        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
+        for (const auto & chunk : slot.input_extra) {
             // { "text": string, "filename": string }
             if (!chunk.contains("text") || !chunk["text"].is_string()) {
                 send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
@@ -932,7 +932,7 @@ struct server_context {
         }
 
         // get prompt
-        if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
+        {
             const auto & prompt = data.find("prompt");
             if (prompt == data.end()) {
                 send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@@ -1958,6 +1958,8 @@ struct server_context {
                                 } break;
                             case SERVER_TASK_CMPL_TYPE_INFILL:
                                 {
+                                    // TODO: optimize this block by reducing memory allocations and movement
+
                                     // use FIM repo-level pattern:
                                     // ref: https://arxiv.org/pdf/2409.12186
                                     //
@@ -1968,10 +1970,11 @@ struct server_context {
                                     // extra chunk 1
                                     // ...
                                     // [FIM_SEP]filename
-                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
+                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
                                     //
-                                    auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
-                                    auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
+                                    auto tokens_prefix = tokenize(slot.input_prefix, false, false);
+                                    auto tokens_suffix = tokenize(slot.input_suffix, false, false);
+                                    auto tokens_prompt = tokenize(slot.prompt,       false, false);
 
                                     slot.extra_tokens.clear();
                                     if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
@@ -1981,7 +1984,7 @@ struct server_context {
                                         slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
                                     }
 
-                                    for (const auto & chunk : slot.params.extra_context) {
+                                    for (const auto & chunk : slot.input_extra) {
                                         // { "text": string, "filename": string }
                                         const std::string text     = chunk.value("text", "");
                                         const std::string filename = chunk.value("filename", "tmp");
@@ -2012,20 +2015,21 @@ struct server_context {
                                     }
 
                                     // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
-                                    const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
+                                    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
+                                    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
 
                                     // fill the rest of the context with extra chunks
                                     const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
 
-                                    prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
-                                    suffix_tokens.resize(n_suffix_take);
+                                    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+                                    tokens_suffix.resize(n_suffix_take);
 
-                                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
-                                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
+                                    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+                                    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+                                    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
 
-                                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                                    auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
+                                    auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
 
                                     if (llama_add_bos_token(model)) {
                                         embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
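
As a rough worked example of the new budget split (the concrete sizes below are assumed, not from the commit): with `n_batch = 2048` the suffix gets at most `n_batch/4 = 512` tokens and the prefix at most `3*(n_batch/4) - 3 = 1533`, where the `- 3` presumably leaves room for the `FIM_PRE`/`FIM_SUF`/`FIM_MID` special tokens; the extra chunks then fill up to `n_ctx - n_batch - 2*n_predict` tokens of the remaining context:

```cpp
// Illustrative only: mirrors the budget arithmetic above with assumed sizes.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_batch   = 2048; // assumed batch size
    const int n_ctx     = 8192; // assumed slot context size
    const int n_predict = 128;  // assumed generation limit

    // hypothetical token counts for an incoming request
    const int n_prefix_have = 4000, n_suffix_have = 900, n_extra_have = 20000;

    const int n_suffix_take = std::min(n_suffix_have,   (n_batch/4));          // -> 512
    const int n_prefix_take = std::min(n_prefix_have, 3*(n_batch/4) - 3);      // -> 1533
    const int n_extra_take  = std::min(std::max(0, n_ctx - n_batch - 2*n_predict),
                                       n_extra_have);                          // -> 5888

    printf("take: prefix %d, suffix %d, extra %d\n",
           n_prefix_take, n_suffix_take, n_extra_take);
}
```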
@@ -2140,40 +2144,17 @@ struct server_context {
 
                                     while (head_c < slot.cache_tokens.size() &&
                                            head_p < prompt_tokens.size()) {
-                                        if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
-                                            slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
-                                            slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
-                                            break;
-                                        }
-
-                                        if (llama_token_is_control(model, prompt_tokens[head_p]) &&
-                                            prompt_tokens[head_p] != llama_token_fim_rep(model) &&
-                                            prompt_tokens[head_p] != llama_token_fim_sep(model)) {
-                                            break;
-                                        }
 
                                         size_t n_match = 0;
-
                                         while (head_c + n_match < slot.cache_tokens.size() &&
                                                head_p + n_match < prompt_tokens.size()     &&
                                                slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
-                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
-                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
-                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
-                                                break;
-                                            }
-
-                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
-                                                prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
-                                                prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
-                                                break;
-                                            }
 
                                             n_match++;
                                         }
 
                                         if (n_match >= (size_t) params.n_cache_reuse) {
-                                            SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                             //for (size_t i = head_p; i < head_p + n_match; i++) {
                                             //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
                                             //}
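
Below is a stripped-down sketch of the reuse scan as it reads after this change, run on plain integer vectors: the per-token control-token checks are gone and any common run of at least `n_cache_reuse` tokens is shifted. The starting offsets and the advance-on-mismatch step are schematic, since this hunk does not show them, and the actual KV-cache calls are omitted:

```cpp
// Schematic version of the simplified chunk-reuse scan (no llama.cpp KV-cache calls).
#include <cstdio>
#include <vector>

int main() {
    // hypothetical token ids; assume the first token was already matched exactly
    const std::vector<int> cache_tokens  = {1, 2, 3, 4, 5, 6, 7, 8};
    const std::vector<int> prompt_tokens = {1, 4, 5, 6, 7, 9};

    const size_t n_cache_reuse = 2; // minimum run length worth shifting
    size_t head_c = 1;              // position in the cached tokens
    size_t head_p = 1;              // position in the new prompt

    while (head_c < cache_tokens.size() && head_p < prompt_tokens.size()) {
        size_t n_match = 0;
        while (head_c + n_match < cache_tokens.size() &&
               head_p + n_match < prompt_tokens.size() &&
               cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
            n_match++;
        }

        if (n_match >= n_cache_reuse) {
            // in the server, the KV cache range [head_c, head_c + n_match)
            // would be shifted to [head_p, head_p + n_match) here
            printf("reusing chunk with size %zu: [%zu, %zu) -> [%zu, %zu)\n",
                   n_match, head_c, head_c + n_match, head_p, head_p + n_match);
            head_c += n_match;
            head_p += n_match;
        } else {
            head_c += 1;
        }
    }
    // prints: reusing chunk with size 4: [3, 7) -> [1, 5)
}
```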