server : enable continuous batching by default (#6231)

author Georgi Gerganov <redacted>

Fri, 22 Mar 2024 11:08:28 +0000 (13:08 +0200)

committer GitHub <redacted>

Fri, 22 Mar 2024 11:08:28 +0000 (13:08 +0200)
author Georgi Gerganov <redacted>
Fri, 22 Mar 2024 11:08:28 +0000 (13:08 +0200)
committer GitHub <redacted>
Fri, 22 Mar 2024 11:08:28 +0000 (13:08 +0200)
diff --git a/common/common.h b/common/common.h

index 8dd8a3edc9c9401fcea5d28563297640ff5039e1..31fd401b6957bfa00b956c51ee51462f114d6219 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -139,7 +139,7 @@ struct gpt_params {
      bool interactive_first = false; // wait for user input immediately
      bool multiline_input   = false; // reverse the usage of `\`
      bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
-    bool cont_batching     = false; // insert new sequences for decoding on-the-fly
+    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
  
      bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
      bool ignore_eos        = false; // ignore generated EOS tokens
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index cf075d6c41aa426164e99f926d8a44f238454e31..c918fabe4a9ed67c31ea3927ebd96f47e2a8df84 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1758,7 +1758,7 @@ struct server_context {
          }
  
          // process in chunks of params.n_batch
-        int32_t n_batch = llama_n_batch(ctx);
+        int32_t n_batch  = llama_n_batch(ctx);
          int32_t n_ubatch = llama_n_ubatch(ctx);
  
          // next, batch any pending prompts without exceeding n_batch
@@ -2225,7 +2225,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
      printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
      printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
      printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
      printf("  -spf FNAME, --system-prompt-file FNAME\n");
      printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
      printf("  -ctk TYPE, --cache-type-k TYPE\n");
author	Georgi Gerganov <redacted>
	Fri, 22 Mar 2024 11:08:28 +0000 (13:08 +0200)
committer	GitHub <redacted>
	Fri, 22 Mar 2024 11:08:28 +0000 (13:08 +0200)
common/common.h		patch | blob | history
examples/server/server.cpp		patch | blob | history