}
}
+const std::vector<ggml_type> kv_cache_types = {
+ GGML_TYPE_F32,
+ GGML_TYPE_F16,
+ GGML_TYPE_BF16,
+ GGML_TYPE_Q8_0,
+ GGML_TYPE_Q4_0,
+ GGML_TYPE_Q4_1,
+ GGML_TYPE_IQ4_NL,
+ GGML_TYPE_Q5_0,
+ GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+ for (const auto & type : kv_cache_types) {
+ if (ggml_type_name(type) == s) {
+ return type;
+ }
+ }
+ throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+ std::ostringstream msg;
+ for (const auto & type : kv_cache_types) {
+ msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+ }
+ return msg.str();
+}
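
A minimal usage sketch of the two new helpers (not part of the patch; it assumes `ggml.h` and the code above are in scope, and that the build links against ggml):

```cpp
// Sketch only: demonstrates the expected behavior of the helpers added above.
#include <cstdio>
#include <stdexcept>

int main() {
    // A name that appears in kv_cache_types maps to its ggml_type enum value.
    ggml_type tk = kv_cache_type_from_str("q8_0");
    printf("k cache type: %s\n", ggml_type_name(tk));

    // A name outside the list throws, which the arg parser surfaces as a CLI error.
    try {
        kv_cache_type_from_str("q2_k");
    } catch (const std::runtime_error & e) {
        printf("error: %s\n", e.what()); // "Unsupported cache type: q2_k"
    }

    // Comma-separated list of accepted names, used to build the --help text.
    printf("allowed: %s\n", get_all_kv_cache_types().c_str());
    return 0;
}
```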
+
//
// CLI argument parsing functions
//
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE",
- string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+ string_format(
+ "KV cache data type for K\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.cache_type_k)
+ ),
[](common_params & params, const std::string & value) {
- // TODO: get the type right here
- params.cache_type_k = value;
+ params.cache_type_k = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
add_opt(common_arg(
{"-ctv", "--cache-type-v"}, "TYPE",
- string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+ string_format(
+ "KV cache data type for V\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.cache_type_v)
+ ),
[](common_params & params, const std::string & value) {
- // TODO: get the type right here
- params.cache_type_v = value;
+ params.cache_type_v = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg(
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| -------- | ----------- |
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) |
+| `--no-warmup` | skip warming up the model with an empty run |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--no-webui` | disable the Web UI<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--no-webui` | disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
Note: If both the command line argument and the environment variable are set for the same param, the argument will take precedence over the env var.