#include "log.h"
#include "ngram-cache.h"
#include "ngram-map.h"
+#include "ngram-mod.h"
#include "sampling.h"
#include <algorithm>
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
+ COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
};
{"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
{"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
{"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
+ {"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
{"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
};
struct common_speculative_state {
const enum common_speculative_type type;
+ // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
+ // TODO: add n_call_begin, n_call_accept
size_t drafts_call_count = 0; // number of times this implementation was called.
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
+ // TODO: rename to t_draft_us
+ // TODO: add t_begin_us, t_accept_us
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
common_speculative_state(enum common_speculative_type type) : type(type) {}
}
};
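+// speculative drafting state backed by common_ngram_mod (see ngram-mod.h):
+// n-grams observed in the target context are stored in a fixed-size table
+// that is shared across all speculative decoding contexts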
+struct common_speculative_state_ngram_mod : public common_speculative_state {
+ common_ngram_mod & mod;
+
+ // the last position in the prompt that was added to the ngram container
+ size_t i_last = 0;
+
+    // number of tokens returned by the last call to draft()
+ size_t n_draft_last = 0;
+
+ // consecutive accept rounds with low acceptance fraction (< 0.5)
+ int n_low = 0;
+
+ // enable trace logging if LLAMA_TRACE is set
+ const bool verbose;
+
+ common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
+ : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
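+        // mod stores tokens as entry_t, so llama_token pointers can be passed to it directly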
+ static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
+ }
+
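+    // populate the table with the n-grams of the prompt and reset the per-request state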
+ void begin(const llama_tokens & prompt) override {
+ i_last = 0;
+
+ n_draft_last = 0;
+
+ const size_t n = mod.get_n();
+
+ if (prompt.size() < n) {
+ return;
+ }
+
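+        // add all n-grams of the prompt to the table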
+ for (size_t i = 0; i < prompt.size() - n; ++i) {
+ mod.add(prompt.data() + i);
+ }
+
+ i_last = prompt.size() - n;
+
+ const double f = (double)mod.get_used() / (double)mod.size();
+ LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
+
+ constexpr double f_thold = 0.25;
+ if (f > f_thold) {
+ LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
+
+ mod.reset();
+ }
+ }
+
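+    // draft up to params.n_max tokens by repeatedly looking up the last n tokens seen so far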
+ void draft(
+ const common_params_speculative & params,
+ const llama_tokens & prompt_tgt,
+ llama_token id_last,
+ llama_tokens & result) override {
+ GGML_UNUSED(params);
+
+ n_draft_last = 0;
+
+        const size_t n = mod.get_n();
+
+        const size_t cur_len = prompt_tgt.size();
+        if (cur_len < n) {
+            return;
+        }
+
+        // add new n-grams to the table in chunks (at least 32 new tokens) to amortize the update cost
+ if (i_last + 32 < cur_len) {
+ for (size_t i = i_last; i < cur_len - n; ++i) {
+ mod.add(prompt_tgt.data() + i);
+ }
+
+ i_last = cur_len - n;
+ }
+
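+        // reserve room for the n-token lookup window plus up to n_max drafted tokens,
+        // then seed the window with the last n - 1 prompt tokens followed by id_last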
+ result.resize(n + params.n_max);
+ for (size_t i = 0; i < n - 1; ++i) {
+ result[i] = prompt_tgt[cur_len - n + 1 + i];
+ }
+ result[n - 1] = id_last;
+
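+        // extend the draft one token at a time; stop on a table miss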
+ for (int i = 0; i < params.n_max; ++i) {
+ const llama_token token = mod.get(result.data() + i);
+ if (token == common_ngram_mod::EMPTY) {
+ if (i < params.n_min) {
+ result.clear();
+ return;
+ }
+
+ result.resize(n + i);
+ break;
+ }
+ result[n + i] = token;
+ }
+
+        // shift out the n-token lookup prefix so that only the drafted tokens are returned
+ for (size_t i = 0; n + i < result.size(); ++i) {
+ result[i] = result[n + i];
+ }
+ result.resize(result.size() - n);
+
+        // record the draft length for acceptance tracking in accept()
+ n_draft_last = result.size();
+ }
+
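+    // track how much of the last draft was accepted; reset the table after
+    // repeated low-acceptance rounds, since its contents are likely stale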
+ void accept(uint16_t n_accepted) override {
+ if (verbose) {
+ LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
+ }
+
+ // compute acceptance fraction if we have a recorded draft length
+ if (n_draft_last > 0) {
+ const double f_acc = (double)n_accepted / (double)n_draft_last;
+ if (f_acc < 0.5) {
+ n_low++;
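+                // three consecutive low-acceptance rounds -> the table is no longer predictive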
+ if (n_low >= 3) {
+                    LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, n_low);
+
+ mod.reset();
+ n_low = 0;
+ }
+ } else {
+ n_low = 0;
+ }
+ }
+ }
+};
+
struct common_speculative_state_ngram_cache : public common_speculative_state {
uint16_t n_draft;
bool save_dynamic;
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod";
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache";
default: return "unknown";
}
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
- const common_params_speculative & params,
- llama_context * ctx_tgt) {
+ common_params_speculative & params,
+ llama_context * ctx_tgt) {
llama_context * ctx_dft = nullptr;
if (params.model_dft) {
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
+ bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
// In a more complex implementation we could use the same implementation but with different parameters.
// This was initially used in PR-18471 but removed to simplify the code.
// This implementation can guess tokens with high acceptance rate but is more expensive.
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
}
+ if (has_ngram_mod) {
+ // shared instance for all speculative decoding contexts
+ if (!params.ngram_mod) {
+ params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
+
+ LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
+ params.ngram_size_n, params.ngram_mod->size(),
+ (float)(params.ngram_mod->size_bytes())/1024/1024);
+
+ if (params.ngram_size_n < 16) {
+ LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
+ }
+ }
+
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
+ }
if (has_ngram_cache) {
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
}
));
break;
}
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
+ GGML_ASSERT(config.params.ngram_mod);
+ impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
+ break;
+ }
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
auto state = create_state_ngram_cache(
params.lookup_cache_static, params.lookup_cache_dynamic, config);
if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
- common_speculative_type_to_str(impl.get()->type).c_str(),
- prompt_tgt.size(),
+ common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
impl.get()->drafts_call_count, result.size());
spec->curr_impl = impl.get(); // set current implementation for stats
str_perf = "";
}
+ // TODO: report time for begin() and accept()
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->drafts_call_count,