* @return Vector of draft tokens, empty if no matching pattern is found
*/
llama_tokens common_ngram_simple_draft(
- common_ngram_simple_state & state,
+ const common_ngram_simple_config & config,
const llama_tokens & tokens, llama_token sampled) {
// Simple implementation of self-speculative decoding without a draft model.
//
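+ // The last size_ngram tokens of the history (ending with the newly sampled
+ // token) are searched for in the earlier history; if a match is found, the
+ // up to size_mgram tokens that followed the match are returned as the draft.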
const size_t cur_len = tokens.size();
- // Only check every check_rate tokens to save compute
- // i.e., perform check if (cur_len - idx_last_check) >= check_rate
- if (state.idx_last_check + state.config.check_rate > cur_len) {
- llama_tokens draft_tokens;
- return draft_tokens;
- }
- size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
- size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft
+ const size_t n_draft_min = config.size_ngram; // size of the n-gram to look up in the token history
+ const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used as the draft
// vector for the tokens we want to verify;
// an empty vector is returned if there is no match.
}
pattern.push_back(sampled); // add the last token to the pattern
- // We do a search in the token history.
- state.idx_last_check = cur_len;
-
size_t match_pos = 0; // position 0 means no match, so it is ignored
// search backwards, but skip the current match (we are currently there)
for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
uint16_t check_rate; // attempt a draft only once every check_rate generated tokens
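+ // e.g. check_rate == 4 attempts a draft on every 4th generated token (illustrative value)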
};
-// current state (and config) of n-gram simple.
-struct common_ngram_simple_state {
- common_ngram_simple_config config;
-
- size_t idx_last_check = 0; // index of last check in context history (mutable)
-
- common_ngram_simple_state(const common_ngram_simple_config & config)
- : config(config) {}
-};
-
// Searches for an n-gram in the history and checks whether a draft sequence should be generated.
-// state: the ngram simple state to search in.
-// inp: the tokens generated so far.
-// sampled: the token that was just sampled.
-// draft: vector to store the draft tokens, initially empty.
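+// config: the n-gram configuration (n-gram/m-gram sizes and check rate).
+// tokens: the tokens generated so far.
+// sampled: the token that was just sampled.
+// Returns the draft tokens; empty if no matching n-gram was found.
+//
+// Usage sketch (the values below are illustrative, not defaults):
+//   common_ngram_simple_config cfg {
+//       /* .size_ngram = */ 3,
+//       /* .size_mgram = */ 8,
+//       /* .check_rate = */ 4
+//   };
+//   llama_tokens draft = common_ngram_simple_draft(cfg, tokens, sampled);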
llama_tokens common_ngram_simple_draft(
- common_ngram_simple_state & state,
+ const common_ngram_simple_config & config,
const llama_tokens & tokens, llama_token sampled);
// state of self-speculation (simple implementation, not ngram-map)
struct common_speculative_state_ngram_simple : public common_speculative_state {
- common_ngram_simple_state state;
+ common_ngram_simple_config config;
+
+ uint16_t check_id = 0; // used to control the frequency of generating drafts
common_speculative_state_ngram_simple(
enum common_speculative_type type,
- common_ngram_simple_state state)
- : common_speculative_state(type), state(state) {}
+ common_ngram_simple_config config)
+ : common_speculative_state(type), config(config) {}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
- result = common_ngram_simple_draft(state, prompt_tgt, id_last);
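+ // only attempt a draft once every check_rate tokens to save compute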
+ ++check_id;
+ if (check_id < config.check_rate) {
+ return;
+ }
+ check_id = 0;
+
+ result = common_ngram_simple_draft(config, prompt_tgt, id_last);
GGML_UNUSED(params);
}
uint16_t mgram_size_value = ngram_map.size_value;
uint16_t check_rate = ngram_map.check_rate;
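+ // build the simple n-gram config from the shared ngram-map settings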
- auto config_simple = common_ngram_simple_config{
+ auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value,
/* .check_rate = */ check_rate
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
- /* .state = */ common_ngram_simple_state(config_simple)
+ /* .config = */ config_simple
);
impls.push_back(std::move(state));
break;