Some improvements to loading the session with --prompt-cache (#1550)

author Kerfuffle <redacted>

Fri, 26 May 2023 02:18:01 +0000 (20:18 -0600)

committer GitHub <redacted>

Fri, 26 May 2023 02:18:01 +0000 (20:18 -0600)
author Kerfuffle <redacted>
Fri, 26 May 2023 02:18:01 +0000 (20:18 -0600)
committer GitHub <redacted>
Fri, 26 May 2023 02:18:01 +0000 (20:18 -0600)
diff --git a/examples/main/README.md b/examples/main/README.md

index 7c03f92c897d9c7577fd01c1b1e8d4cc25cd8633..e71ba617366a99081c22906089cf5bf62aa2c02d 100644 (file)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.
  
  ### Prompt Caching
  
--   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+-   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when you specify a fixed seed, you are not guaranteed to get the same sequence of tokens as the original generation.
  
  ### Quantization
  
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 47b418d972bbca6cc00ae5d5f292233634180a93..c7c591537419c5631dde017958edac71799852b6 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
          return 0;
      }
  
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
  
      std::string path_session = params.path_prompt_cache;
      std::vector<llama_token> session_tokens;
@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
                  return 1;
              }
              session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);
  
              fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
          } else {
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
      }
  
      // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }
  
      const int n_ctx = llama_n_ctx(ctx);
  
@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
              }
              n_matching_session_tokens++;
          }
-        if (n_matching_session_tokens >= embd_inp.size()) {
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
              fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
          } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
              fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
author	Kerfuffle <redacted>
	Fri, 26 May 2023 02:18:01 +0000 (20:18 -0600)
committer	GitHub <redacted>
	Fri, 26 May 2023 02:18:01 +0000 (20:18 -0600)
examples/main/README.md		patch | blob | history
examples/main/main.cpp		patch | blob | history