speculative : ensure draft and target model vocab matches (#3812)

author Kerfuffle <redacted>

Fri, 27 Oct 2023 21:40:07 +0000 (15:40 -0600)

committer GitHub <redacted>

Fri, 27 Oct 2023 21:40:07 +0000 (00:40 +0300)
author Kerfuffle <redacted>
Fri, 27 Oct 2023 21:40:07 +0000 (15:40 -0600)
committer GitHub <redacted>
Fri, 27 Oct 2023 21:40:07 +0000 (00:40 +0300)
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp

index 92ad27e8e423ccf97e7f1258bab8fe26932c770b..f921b78455a72c4be7cc1ad86da78217f8132447 100644 (file)
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -8,6 +8,9 @@
  #include <string>
  #include <vector>
  
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
  struct seq_draft {
      bool active   = false;
      bool drafting = false;
@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
      params.n_gpu_layers = params.n_gpu_layers_draft;
      std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
  
+    {
+        const int n_vocab_tgt = llama_n_vocab(model_tgt);
+        const int n_vocab_dft = llama_n_vocab(model_dft);
+        const int vocab_diff  = n_vocab_tgt > n_vocab_dft
+            ? n_vocab_tgt - n_vocab_dft
+            : n_vocab_dft - n_vocab_tgt;
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
+            fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return 1;
+        }
+
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
+            const char * token_text_dft = llama_token_get_text(model_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
+                fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
+                        llama_token_to_piece(ctx_tgt, i).c_str(),
+                        llama_token_to_piece(ctx_dft, i).c_str());
+                return 1;
+            }
+        }
+    }
+
      // tokenize the prompt
      std::vector<llama_token> inp;
      inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
              llama_batch_add  (batch_dft, id, n_past_dft, { 0 }, true);
  
              llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
              llama_decode         (ctx_dft, batch_dft);
  
              ++n_past_dft;
@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
                  llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
              }
  
-            //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
+            // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
              llama_decode(ctx_tgt, batch_tgt);
              ++n_past_tgt;
          }
author	Kerfuffle <redacted>
	Fri, 27 Oct 2023 21:40:07 +0000 (15:40 -0600)
committer	GitHub <redacted>
	Fri, 27 Oct 2023 21:40:07 +0000 (00:40 +0300)