int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
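A minimal sketch, assuming the fields above and the common_speculative_params struct shown further down, of how these user-facing defaults could be forwarded into the per-draft parameters (ctx_dft and params are placeholders here; the actual server wiring may differ):

common_speculative_params params_spec;
params_spec.n_draft = params.speculative.n_max;                         // cap on tokens drafted per step
params_spec.n_reuse = llama_n_ctx(ctx_dft) - params.speculative.n_max;  // draft-context reuse budget
params_spec.p_min   = params.speculative.p_min;                         // now 0.75f by default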
// add drafted token for each sequence
const llama_token id = cur_p->data[0].id;
- // only collect very high-confidence draft tokens
- if (cur_p->data[0].p < params.p_min) {
- break;
- }
-
common_sampler_accept(smpl, id, true);
result.push_back(id);
if (params.n_draft <= (int) result.size()) {
break;
}
+ // only collect very high-confidence draft tokens
+ if (cur_p->data[0].p < params.p_min) {
+ break;
+ }
+
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
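// With this ordering, the p_min check runs after the token has been accepted into the
// sampler and pushed into `result`, so a low-confidence token is still kept as the
// last drafted token; it only stops further drafting. The lower p_min default (0.75f
// instead of 0.9f) additionally lets less confident tokens extend the draft.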
// evaluate the drafted tokens on the draft model
int n_draft = 16; // max drafted tokens
int n_reuse = 256;
- float p_min = 0.9f; // min probability required to accept a token in the draft
+ float p_min = 0.75f; // min probability required to accept a token in the draft
};
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
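A minimal usage sketch for the helper declared above, assuming the gen_draft/free entry points from common/speculative.h (not shown in this diff):

struct common_speculative * spec = common_speculative_init(ctx_dft);

common_speculative_params params_spec;
params_spec.n_draft = 16;    // at most 16 drafted tokens per step
params_spec.p_min   = 0.75f; // stop drafting once confidence drops below this

// prompt_tgt: tokens already processed by the target model, id_last: last sampled token
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);

common_speculative_free(spec);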
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
- params.speculative.n_min = std::max(params.speculative.n_min, 2);
+ params.speculative.n_min = std::max(params.speculative.n_min, 0);
params.speculative.n_max = std::max(params.speculative.n_max, 0);
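// Worked example of the clamping above, with illustrative request values:
//   n_max = 16, n_min = 5              -> n_min = std::min(16, 5) = 5; n_min = std::max(5, 0) = 5; n_max = 16
//   n_max = 16, n_min = 0 (new default) -> n_min stays 0; the previous floor of 2 would have raised it to 2
// i.e. the clamp now only guards against negative values instead of forcing at least 2 draft tokens.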
// Use OpenAI API logprobs only if n_probs wasn't provided