int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
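A minimal sketch, assuming the fields above and the common_speculative_params struct shown further down, of how these user-facing defaults could be forwarded into the per-draft parameters (ctx_dft and params are placeholders here; the actual server wiring may differ):

common_speculative_params params_spec;
params_spec.n_draft = params.speculative.n_max;                         // cap on tokens drafted per step
params_spec.n_reuse = llama_n_ctx(ctx_dft) - params.speculative.n_max;  // draft-context reuse budget
params_spec.p_min   = params.speculative.p_min;                         // now 0.75f by default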
// add drafted token for each sequence
const llama_token id = cur_p->data[0].id;
- // only collect very high-confidence draft tokens
- if (cur_p->data[0].p < params.p_min) {
- break;
- }
-
common_sampler_accept(smpl, id, true);
result.push_back(id);
if (params.n_draft <= (int) result.size()) {
break;
}
+ // only collect very high-confidence draft tokens
+ if (cur_p->data[0].p < params.p_min) {
+ break;
+ }
+
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
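// With this ordering, the p_min check runs after the token has been accepted into the
// sampler and pushed into `result`, so a low-confidence token is still kept as the
// last drafted token; it only stops further drafting. The lower p_min default (0.75f
// instead of 0.9f) additionally lets less confident tokens extend the draft.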
// evaluate the drafted tokens on the draft model
int n_draft = 16; // max drafted tokens
int n_reuse = 256;
- float p_min = 0.9f; // min probability required to accept a token in the draft
+ float p_min = 0.75f; // min probability required to accept a token in the draft
};
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
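A minimal usage sketch for the helper declared above, assuming the gen_draft/free entry points from common/speculative.h (not shown in this diff):

struct common_speculative * spec = common_speculative_init(ctx_dft);

common_speculative_params params_spec;
params_spec.n_draft = 16;    // at most 16 drafted tokens per step
params_spec.p_min   = 0.75f; // stop drafting once confidence drops below this

// prompt_tgt: tokens already processed by the target model, id_last: last sampled token
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);

common_speculative_free(spec);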
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
- params.speculative.n_min = std::max(params.speculative.n_min, 2);
+ params.speculative.n_min = std::max(params.speculative.n_min, 0);
params.speculative.n_max = std::max(params.speculative.n_max, 0);
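// Worked example of the clamping above, with illustrative request values:
//   n_max = 16, n_min = 5              -> n_min = std::min(16, 5) = 5; n_min = std::max(5, 0) = 5; n_max = 16
//   n_max = 16, n_min = 0 (new default) -> n_min stays 0; the previous floor of 2 would have raised it to 2
// i.e. the clamp now only guards against negative values instead of forcing at least 2 draft tokens.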
// Use OpenAI API logprobs only if n_probs wasn't provided