spec : various improvements to ngram-map + docs (#19253)

author Sascha Rogmann <redacted>

Mon, 2 Feb 2026 06:26:58 +0000 (07:26 +0100)

committer GitHub <redacted>

Mon, 2 Feb 2026 06:26:58 +0000 (08:26 +0200)
author Sascha Rogmann <redacted>
Mon, 2 Feb 2026 06:26:58 +0000 (07:26 +0100)
committer GitHub <redacted>
Mon, 2 Feb 2026 06:26:58 +0000 (08:26 +0200)
diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp

index 84fd761367d37475481b665c76031f13ef397b41..cab231bad7006486e3b97b1303120fbb4cd551b7 100644 (file)
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -7,6 +7,18 @@
  #include <cstdio>
  #include <sstream>
  
+// prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32.
+#define LCG_FACTOR 2654435761UL
+
+// Compute the LCG hash (32 bit) of an n-gram of size len at offset start in tokens; returns 0 if len is 0.
+static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
+    uint32_t hash = 0;
+    for (size_t i = 0; i < len; ++i) {
+        hash = hash * LCG_FACTOR + tokens[start + i]; // truncated to 32 bit on assignment; caller must ensure start + len <= tokens.size()
+    }
+    return hash;
+}
+
  // Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
  static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
      std::ostringstream oss;
@@ -115,6 +127,100 @@ llama_tokens common_ngram_simple_draft(
  // maximum number of counted values of a ngram map value.
  #define COMMON_NGRAM_MAX_VALUE_COUNT 16380
  
+void common_ngram_map_begin( // called at the start of a generation; purges stale entries when tokens.size() < map.idx_last_check
+    common_ngram_map & map, const llama_tokens & tokens) {
+    size_t size_begin = tokens.size(); // number of tokens at the start of this generation
+
+    LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__,
+            map.idx_last_check, size_begin, map.keys.size()); // NOTE: the "idx_last_draft" label prints map.idx_last_check
+
+    size_t count_map_entries_upd = 0; // number of key_map slots cleared below (reported in the refresh log)
+    if (!map.key_map.empty() && size_begin < map.idx_last_check) {
+        if (map.show_key_map_stats) {
+            // Print statistics of the hash map key_map (a slot value of 0 is counted as unused).
+            size_t count_nonzero = 0;
+            uint32_t min_idx = UINT32_MAX;
+            uint32_t max_idx = 0;
+            for (size_t i = 0; i < map.key_map.size(); ++i) {
+                uint32_t key_idx = map.key_map[i];
+                if (key_idx != 0) {
+                    ++count_nonzero;
+                    if (key_idx < min_idx) min_idx = key_idx;
+                    if (key_idx > max_idx) max_idx = key_idx;
+                }
+            }
+            if (count_nonzero == 0) { // no used slot: report min_idx as 0 instead of UINT32_MAX
+                min_idx = 0;
+            }
+            LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
+                    __func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
+        }
+
+        // Update the map from hash to key index (clear outdated entries, i.e. indices >= map.size_last_begin).
+        for (size_t i = 0; i < map.key_map.size(); ++i) {
+            uint32_t key_idx = map.key_map[i];
+            if (key_idx >= map.size_last_begin) {
+                map.key_map[i] = 0;
+                count_map_entries_upd++;
+            }
+        }
+        map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0; // common_ngram_map_draft only adds hashes for indices above this value
+    }
+
+    if (size_begin < map.idx_last_check && !map.keys.empty()) {
+        // The next token generation will start at index size_begin.
+        // The tokens between map.size_last_begin and size_begin are no longer valid.
+        //
+        // Refresh map: Remove all entries with index >= map.size_last_begin.
+        size_t count_keys = map.keys.size();
+        size_t count_keys_del = 0;
+        size_t count_values_del = 0;
+        for (int32_t i = map.keys.size() - 1; i >= 0; --i) { // walk backwards so erase() does not shift the keys still to be visited
+            common_ngram_map_key & key = map.keys[i];
+            if (key.key_idx >= map.size_last_begin) {
+                // Delete the key.
+                LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
+                map.keys.erase(map.keys.begin() + i);
+                count_keys_del++;
+                continue;
+            }
+            if (map.key_only) { // key-only mode: the value slots are not inspected
+                continue;
+            }
+
+            // Check the indices of the values (from the last slot to the first).
+            for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
+                common_ngram_map_value & value = key.values[j];
+                if (value.value_idx >= map.size_last_begin) {
+                    // Delete the value.
+                    count_values_del++;
+
+                    // Move all values after this value to the left.
+                    for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
+                        key.values[k] = key.values[k + 1];
+                    }
+                    // Clear the last value (only value_idx and value_num are reset here).
+                    key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
+                    key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
+                }
+            }
+            if (key.values[0].value_idx == 0) { // value_idx 0 marks an unused slot; slot 0 unused means no values remain
+                // No values left, delete the key.
+                LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
+                map.keys.erase(map.keys.begin() + i);
+                count_keys_del++;
+            }
+        }
+
+        LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
+                map.idx_last_check, size_begin,
+                count_keys, count_keys_del, count_values_del, count_map_entries_upd);
+    }
+
+    map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0; // uses the previous size_last_begin (it is overwritten on the next line)
+    map.size_last_begin = size_begin; // remember the token count of this begin for the next call
+}
+
  void common_ngram_map_draft(common_ngram_map & map,
          const llama_tokens & inp, llama_token sampled,
          llama_tokens & draft) {
@@ -129,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map,
      if (cur_len < static_cast<size_t>(2 * n + m)) {
          return;
      }
+    if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
+        // key_map uses uint32_t instead of size_t.
+        GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
+    }
  
      // Only check every check_rate tokens to save compute
      // i.e., perform check if (cur_len - idx_last_check) >= check_rate
@@ -147,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map,
  
      // search for the key in the map
      size_t match_pos = 0;
-    for (size_t j = cur_len - n - m - 1; j > 0; --j) {
-        bool match = true;
-        for (size_t k = 0; k < n; ++k) {
-            if (inp[j + k] != key_tokens[k]) {
-                match = false;
-                break;
+    if (map.size_last_begin > cur_len) {
+        GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
+    }
+    if (!map.key_map.empty()) {
+        // Search for the key in the map key_map from hash of ngrams to index of ngram.
+        uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
+        uint32_t idx_key = map.key_map[idx_hash];
+        if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
+            // Check if the key matches the key at idx_key (because of possible collisions).
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[idx_key + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
+            if (match) {
+                match_pos = idx_key;
              }
          }
-        if (match) {
-           match_pos = j;
-           break;
+    }
+    if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
+        // Search for the key in [1, map.size_last_begin - n - m -1], descending.
+        for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
+            // Check if the key matches the key.
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[j + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+               match_pos = j;
+               break;
+            }
+        }
+    }
+    if (match_pos == 0) {
+        // In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
+        //
+        // Search in [size_last_begin, cur_len - n - m - 1], descending.
+        for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[j + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+               match_pos = j;
+               break;
+            }
          }
      }
      if (match_pos > 0) {
-        LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
+        LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
              cur_len, n, m, key_tokens.size(), sampled, match_pos);
      }
  
+    if (!map.key_map.empty()) {
+        // Add hashes of new ngrams in key_map.
+        //
+        // Use the same order as above.
+        if (map.size_last_begin > (size_t) (n + m + 1)) {
+            for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
+                // compute hash and store index of ngram at idx j in the map.
+                uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
+                if (map.key_map[idx_hash] == 0) {
+                    map.key_map[idx_hash] = j; // collisions may occur
+                }
+            }
+        }
+
+        for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
+            // compute hash and store index of ngram at idx j in the map.
+            uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
+            if (map.key_map[idx_hash] == 0) {
+                map.key_map[idx_hash] = j;
+            }
+        }
+        map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
+    }
+
      if (match_pos == 0) {
          return;
      }
@@ -215,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map,
              draft.push_back(inp[match_pos + n + i]);
          }
  
-        LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
-                key_offset, curr_key.key_num, draft.size());
+        LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
+                curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
  
          map.last_draft_created   = false;
          map.last_draft_key_idx   = key_offset;
@@ -318,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map,
          }
      }
  
-    if (sum_occur > 0 && max_occur < 3 * sum_occur) {
+    if (sum_occur > 0 && max_occur < 2 * sum_occur) {
          // The most frequent value is not much more frequent than the other values.
          // We do not use the draft.
          return;
diff --git a/common/ngram-map.h b/common/ngram-map.h

index b365034ac51afb5c7fc252121ef493240a8ff5ce..c094d513d5dfd68937c0e3d36348afb60d13ad39 100644 (file)
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -9,6 +9,8 @@
  // 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
  //    The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
  //
+// ref: https://github.com/ggml-org/llama.cpp/pull/18471
+//
  
  #include "llama.h"
  #include "common.h"
@@ -51,10 +53,13 @@ llama_tokens common_ngram_simple_draft(
  // maximum number of m-gram values stored for each key n-gram.
  #define COMMON_NGRAM_MAX_VALUES 4
  
+// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
+#define COMMON_NGRAM_HASH_MAP_SIZE 262144
+
  // statistics of a m-gram after a known n-gram
  struct common_ngram_map_value {
-    size_t   value_idx = 0;  // index of value m-gram in token-history (0 if unused)
-    uint16_t value_num = 0;  // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
+    size_t   value_idx =  0;  // index of value m-gram in token-history (0 if unused)
+    uint16_t value_num =  0;  // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
      int16_t n_accepted = -1;  // number of accepted tokens at last draft (-1 if unused)
  };
  
@@ -74,23 +79,43 @@ struct common_ngram_map {
  
      bool key_only;       // true if only key n-grams are used, no values.
  
-    // first draft: vector only, no map.
      std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
      uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
      uint16_t min_hits;   // minimum number of key hits to consider a draft
  
+    bool     show_key_map_stats = false; // true if statistics of the key_map should be printed.
+
      common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
                       uint16_t check_rate, uint16_t min_hits)
          : size_key(sz_key), size_value(sz_value), key_only(only_keys),
-          check_rate(check_rate), min_hits(min_hits) {}
+          check_rate(check_rate), min_hits(min_hits) {
+        key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
+    }
+
+    // In reasoning chats the previous reasoning block will be removed from context history.
+    // A rebuild of the ngram map is needed after that.
+
+    size_t   size_last_begin      = 0; // number of tokens at previous start of generation
  
      bool     last_draft_created   = false; // true if a draft was created at last call.
-    size_t   last_draft_key_idx   = 0; // index of last key used for draft generation.
+    size_t   last_draft_key_idx   = 0; // index of last key used for draft generation (0 = no draft)
      uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
  
      size_t   idx_last_check       = 0; // index of last check in context history
+
+    // optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
+    //
+    // uint32_t instead of size_t (size of current histories is << UINT32_MAX)
+    std::vector<uint32_t> key_map;              // key_map[hash] = index of ngram in context window
+    uint32_t              key_map_last_idx = 0; // index of the last ngram added to key_map
  };
  
+// Initialize the n-gram map with the given token history.
+// map:                the ngram map to initialize.
+// tokens:             the token history to base the map on.
+void common_ngram_map_begin(
+    common_ngram_map & map,
+    const llama_tokens & tokens);
  
  // Searches for the n-gram in the history and checks whether a draft sequence should be generated.
  // map:                the ngram map to search in.
diff --git a/common/speculative.cpp b/common/speculative.cpp

index a1a3b51c13474b856c9eef308ec3489a8a1c0317..152aaa48d44b70d12f466a0537b7c6a6e5f59a02 100644 (file)
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -124,9 +124,9 @@ struct common_speculative_state {
      // TODO: track performance of most recent calls
      const bool gen_perf = true; // whether to generate performance stats.
  
-    // TODO: rename to t_draft_us
-    // TODO: add t_begin_us, t_accept_us
-    int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
+    int64_t t_begin_us  = 0; // total time spent in refresh of this implementation in microseconds.
+    int64_t t_draft_us  = 0; // total time spent in generating drafts in this implementation in microseconds.
+    int64_t t_accept_us = 0; // total time spent in accept() (processing of accepted drafts) of this implementation in microseconds.
  
      common_speculative_state(enum common_speculative_type type) : type(type) {}
  
@@ -499,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
          : common_speculative_state(type), map(std::move(map)) {}
  
      void begin(const llama_tokens & prompt) override {
-        GGML_UNUSED(prompt);
+        common_ngram_map_begin(map, prompt);
      }
  
      void draft(
@@ -951,7 +951,12 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
      }
  
      for (auto & impl : spec->impls) {
+        const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
+
          impl->begin(prompt);
+
+        const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
+        impl->t_begin_us += t_now_us - t_start_us; // accumulate duration for this refresh
      }
  }
  
@@ -973,7 +978,7 @@ llama_tokens common_speculative_draft(
              const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
  
              impl->drafts_call_count++;
-            impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation
+            impl->t_draft_us += t_now_us - t_start_us; // accumulate duration for this implementation
          }
  
          if (!result.empty()) {
@@ -1001,12 +1006,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
  
      GGML_ASSERT(impl);
  
+    const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
      if (n_accepted > 0) {
          impl->drafts_accepted_count++;
          impl->drafts_accepted_tokens += n_accepted;
      }
  
      impl->accept(n_accepted);
+    const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
+    impl->t_accept_us += t_now_us - t_start_us; // accumulate duration for this accept call
  }
  
  void common_speculative_print_stats(const common_speculative * spec) {
@@ -1018,13 +1026,14 @@ void common_speculative_print_stats(const common_speculative * spec) {
          std::string str_perf;
          if (impl->gen_perf) {
              std::ostringstream oss;
-            oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0;
-            str_perf = ", dur = " + oss.str() + " ms";
+            oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
+            oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
+            oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
+            str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
          } else {
              str_perf = "";
          }
  
-        // TODO: report time for begin() and accept()
          LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
                  common_speculative_type_to_str(impl->type).c_str(),
                  impl->drafts_call_count,
diff --git a/docs/speculative.md b/docs/speculative.md

index 8281eaa2d34b81f31848f779093abad8abdee498..03afab5b41e1fc77ef69ef709d3515d4d915171c 100644 (file)
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -6,7 +6,7 @@ llama.cpp supports speculative decoding, a technique that can significantly acce
  
  ## Implementations
  
-The `llama-server` application supports several implementations of speculative decoding:
+The `llama-server` application supports several implementations of speculative decoding. An implementation with a draft model can be combined with an implementation without a draft model.
  
  ### Draft Model (`draft`)
  
@@ -32,12 +32,21 @@ An example to use this approach can be the rewriting of source code by a LLM.
  
  This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
  
+```
+llama-server [...] --spec-type ngram-simple --draft-max 64
+```
+
  #### n-gram Map Key (`ngram-map-k`)
  
-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
  
  The number of accepted tokens is stored for each used n-gram.
  
+**Example:**
+```
+llama-server [...] --spec-type ngram-map-k --draft-max 64
+```
+
  #### n-gram Map Key-4-Values (`ngram-map-k4v`)
  
  This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
@@ -45,17 +54,65 @@ This experimental implementation looks for the current n-gram of size n (called
  The number of accepted tokens is stored for each used n-gram.
  
  **Example:** Server options to be used if there are a lot of longer repetitions.
-```bash
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
  ```
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
+```
+
+### n-gram Mod (`ngram-mod`)
+
+This implementation uses a basic n-gram hasher for speculative decoding:
+
+- For each ngram, compute a hash using LCG
+- For each computed hash, store the next token
+- During speculation, iteratively compute the rolling hash of the last n tokens and pick the next token from the storage
+
+Some characteristics:
+
+- Lightweight (~16 MB)
+- Constant memory and complexity
+- Can generate variable draft lengths (i.e. m is not fixed)
+
+Currently, a single hash pool is shared across all server slots, so different requests can benefit from each other.
  
+**Sample usage:**
+
+```
+# notes:
+# - small `n` are not recommended
+# - MoEs require long drafts
+# - dense models: can reduce `--draft-min` and `--draft-max`
+
+llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+```
+
+Applications:
+
+- Iterating over a block of text/code (e.g. in llama.vim)
+- Reasoning models (when they have to repeat their thinking in the final answer)
+- Summarization
+
+Example Video:
+
+- See #19164
+
+### Differences between ngram-simple, ngram-map and ngram-mod
+
+- ngram-simple looks for a previous matching n-gram and inserts the following m-gram.
+- ngram-map-k looks for a previous matching n-gram and inserts the following m-gram but uses an internal hash-map of n-grams in the current context window.
+- ngram-mod uses a hash pool which is shared across all server slots. The hash pool is a map from n-gram hash to the next token (not the next m-gram as in ngram-map).
  
  ## Command-Line Options
  
  If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
  
  ```
---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
+--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_DRAFT_MAX)
+--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
+                                        (default: 0)
+                                        (env: LLAMA_ARG_DRAFT_MIN)
+[...]
+--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                          type of speculative decoding to use when no draft model is provided
                                          (default: none)
  --spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
@@ -78,6 +135,7 @@ Specifies a type of speculative decoding without draft model.
  | `ngram-simple` | Use simple n-gram pattern matching |
  | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
  | `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
+| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
  
  **Example:** Server-instance used to refactor source code.
  ```bash
@@ -112,9 +170,15 @@ statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tok
  statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
  ```
  
+```
+draft acceptance rate = 0.70312 (   90 accepted /   128 generated)
+statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
+```
+
  - `#calls`: number of calls of this implementations
  - `#gen drafts`: number of drafts generated by this implementation
  - `#acc drafts`: number of drafts accepted (partially) by the main model
  - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
  - `#acc tokens`: number of tokens accepted by the main model
+- `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).
author	Sascha Rogmann <redacted>
	Mon, 2 Feb 2026 06:26:58 +0000 (07:26 +0100)
committer	GitHub <redacted>
	Mon, 2 Feb 2026 06:26:58 +0000 (08:26 +0200)
common/ngram-map.cpp		patch \| blob \| history
common/ngram-map.h		patch \| blob \| history
common/speculative.cpp		patch \| blob \| history
docs/speculative.md		patch \| blob \| history