server : add KV cache quantization options (#5684)

author AlpinDale <redacted>

Fri, 23 Feb 2024 19:31:54 +0000 (19:31 +0000)

committer GitHub <redacted>

Fri, 23 Feb 2024 19:31:54 +0000 (21:31 +0200)
author AlpinDale <redacted>
Fri, 23 Feb 2024 19:31:54 +0000 (19:31 +0000)
committer GitHub <redacted>
Fri, 23 Feb 2024 19:31:54 +0000 (21:31 +0200)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 369121e885b27e4f7c98f6eff960f1d53f106093..524d0ada33ab0c047428180d849bb79a673fc1c9 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1948,6 +1948,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
      printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
      printf("  -spf FNAME, --system-prompt-file FNAME\n");
      printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
+    printf("  -ctk TYPE, --cache-type-k TYPE\n");
+    printf("                            KV cache data type for K (default: f16)\n");
+    printf("  -ctv TYPE, --cache-type-v TYPE\n");
+    printf("                            KV cache data type for V (default: f16)\n");
      printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
      printf("  --log-disable             disables logging to a file.\n");
      printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
@@ -2386,6 +2390,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
              );
              llama.process_system_prompt_data(json::parse(systm_content));
          }
+        else if (arg == "-ctk" || arg == "--cache-type-k") {
+            params.cache_type_k = argv[++i];
+        }
+        else if (arg == "-ctv" || arg == "--cache-type-v") {
+            params.cache_type_v = argv[++i];
+        }
          else if(arg == "--mmproj")
          {
              if (++i >= argc)
author	AlpinDale <redacted>
	Fri, 23 Feb 2024 19:31:54 +0000 (19:31 +0000)
committer	GitHub <redacted>
	Fri, 23 Feb 2024 19:31:54 +0000 (21:31 +0200)