From: Georgi Gerganov Date: Sun, 9 Mar 2025 17:08:20 +0000 (+0200) Subject: server : add speculative decoding presets for FIM (#12287) X-Git-Tag: upstream/0.0.5028~170 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=1e2f78a00450593e2dfa458796fcdd9987300dfc;p=pkg%2Fggml%2Fsources%2Fllama.cpp server : add speculative decoding presets for FIM (#12287) --- diff --git a/common/arg.cpp b/common/arg.cpp index 3e549ede..b96a5678 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2571,5 +2571,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--fim-qwen-7b-spec"}, + string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-14b-spec"}, + string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; + params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + return ctx_arg; }