common : add missing env var for speculative (#10801)

author: Xuan Son Nguyen <redacted>, Thu, 12 Dec 2024 15:57:32 +0000 (16:57 +0100)
committer: GitHub <redacted>, Thu, 12 Dec 2024 15:57:32 +0000 (16:57 +0100)
author Xuan Son Nguyen <redacted>
Thu, 12 Dec 2024 15:57:32 +0000 (16:57 +0100)
committer GitHub <redacted>
Thu, 12 Dec 2024 15:57:32 +0000 (16:57 +0100)
diff --git a/common/arg.cpp b/common/arg.cpp

index 49af31682510d3270690bcb4077a6fdc2195a497..b27567f3b532718406a798f848ec44b9c616b4dd 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2083,35 +2083,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          [](common_params & params, int value) {
              params.speculative.n_max = value;
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
      add_opt(common_arg(
          {"--draft-min", "--draft-n-min"}, "N",
          string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
          [](common_params & params, int value) {
              params.speculative.n_min = value;
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
      add_opt(common_arg(
          {"--draft-p-split"}, "P",
          string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
          [](common_params & params, const std::string & value) {
              params.speculative.p_split = std::stof(value);
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
      add_opt(common_arg(
          {"--draft-p-min"}, "P",
          string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
          [](common_params & params, const std::string & value) {
              params.speculative.p_min = std::stof(value);
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
      add_opt(common_arg(
          {"-cd", "--ctx-size-draft"}, "N",
          string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
          [](common_params & params, int value) {
              params.speculative.n_ctx = value;
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
      add_opt(common_arg(
          {"-devd", "--device-draft"}, "<dev1,dev2,..>",
          "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2131,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                  fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
              }
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
      add_opt(common_arg(
          {"-md", "--model-draft"}, "FNAME",
          "draft model for speculative decoding (default: unused)",
          [](common_params & params, const std::string & value) {
              params.speculative.model = value;
          }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
  
      return ctx_arg;
  }
author: Xuan Son Nguyen <redacted>, Thu, 12 Dec 2024 15:57:32 +0000 (16:57 +0100)
committer: GitHub <redacted>, Thu, 12 Dec 2024 15:57:32 +0000 (16:57 +0100)