server : --n-predict option document and cap to max value (#5549)

author Pierrick Hymbert <redacted>

Sun, 18 Feb 2024 16:30:09 +0000 (17:30 +0100)

committer GitHub <redacted>

Sun, 18 Feb 2024 16:30:09 +0000 (18:30 +0200)
author Pierrick Hymbert <redacted>
Sun, 18 Feb 2024 16:30:09 +0000 (17:30 +0100)
committer GitHub <redacted>
Sun, 18 Feb 2024 16:30:09 +0000 (18:30 +0200)
diff --git a/examples/server/README.md b/examples/server/README.md

index 249368749ff07b2e8503b900eda4f1df39d1a6f3..fe5cd8d5d138207a2f1e4671d625008edde145bf 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
  - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
  - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
  - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
  
  ## Build
  
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 7800c6e7e67e8d19939648bfc54ab15dfd481475..7aa706e9530798d9530915a7136838e726704d32 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -159,6 +159,7 @@ struct llama_client_slot
      int32_t n_decoded   = 0;
      int32_t n_remaining = -1;
      int32_t i_batch     = -1;
+    int32_t n_predict   = -1;
  
      int32_t num_prompt_tokens           = 0;
      int32_t num_prompt_tokens_processed = 0;
@@ -410,6 +411,7 @@ struct llama_server_context
  
              slot.id = i;
              slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
  
              LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
  
@@ -546,6 +548,15 @@ struct llama_server_context
          slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
          slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
  
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
+
          // infill
          if (data.count("input_prefix") != 0)
          {
@@ -1053,6 +1064,7 @@ struct llama_server_context
  
          return json {
              {"n_ctx",             slot.n_ctx},
+            {"n_predict",         slot.n_predict},
              {"model",             params.model_alias},
              {"seed",              slot.params.seed},
              {"temperature",       slot.sparams.temp},
@@ -1915,13 +1927,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
      printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
      printf("  --log-disable             disables logging to a file.\n");
      printf("\n");
+    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
      printf("  --override-kv KEY=TYPE:VALUE\n");
      printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
      printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
      printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
      printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
      printf("  --chat-template FORMAT_NAME");
-    printf("                            set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                            set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
      printf("\n");
  }
author	Pierrick Hymbert <redacted>
	Sun, 18 Feb 2024 16:30:09 +0000 (17:30 +0100)
committer	GitHub <redacted>
	Sun, 18 Feb 2024 16:30:09 +0000 (18:30 +0200)
examples/server/README.md		patch | blob | history
examples/server/server.cpp		patch | blob | history