server : enable cache_prompt by default (#10501)

author Georgi Gerganov <redacted>

Mon, 25 Nov 2024 19:50:07 +0000 (21:50 +0200)

committer GitHub <redacted>

Mon, 25 Nov 2024 19:50:07 +0000 (21:50 +0200)
author Georgi Gerganov <redacted>
Mon, 25 Nov 2024 19:50:07 +0000 (21:50 +0200)
committer GitHub <redacted>
Mon, 25 Nov 2024 19:50:07 +0000 (21:50 +0200)
diff --git a/examples/server/README.md b/examples/server/README.md

index 0936e0b7ba13a7b3e0953f2b0e87c11cbe5c5d55..877768c8b0bd2f901317a6f1c55efa541011352a 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -412,7 +412,7 @@ node index.js
  
      `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot.  Default: `-1`
  
-    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
+    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
  
      `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
  
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 8684771e2d0bbd2f08aaa53179eef6e01102f8ff..c0ea4faf77d42d6925a4d3ea9db2a848bb3ff594 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -111,7 +111,7 @@ struct server_static_file {
  
  struct slot_params {
      bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
  
      int32_t n_keep    =  0; // number of tokens to keep from initial prompt
      int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -883,7 +883,7 @@ struct server_context {
          }
  
          slot.params.stream           = json_value(data, "stream",             false);
-        slot.params.cache_prompt     = json_value(data, "cache_prompt",       false);
+        slot.params.cache_prompt     = json_value(data, "cache_prompt",       true);
          slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
          slot.params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
          slot.params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);
author	Georgi Gerganov <redacted>
	Mon, 25 Nov 2024 19:50:07 +0000 (21:50 +0200)
committer	GitHub <redacted>
	Mon, 25 Nov 2024 19:50:07 +0000 (21:50 +0200)
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history