server : fix kv cache management (#3588)

author Georgi Gerganov <redacted>

Thu, 12 Oct 2023 06:29:04 +0000 (09:29 +0300)

committer GitHub <redacted>

Thu, 12 Oct 2023 06:29:04 +0000 (09:29 +0300)
author Georgi Gerganov <redacted>
Thu, 12 Oct 2023 06:29:04 +0000 (09:29 +0300)
committer GitHub <redacted>
Thu, 12 Oct 2023 06:29:04 +0000 (09:29 +0300)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d992feeef7026ece01e2484db463c8586ce38dc2..ee0ababb1d5cee93bb2ce40b709f40da70cbec6d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ struct llama_server_context
          // compare the evaluated prompt with the new prompt
          n_past = common_part(embd, prompt_tokens);
          embd = prompt_tokens;
+
          if (n_past == num_prompt_tokens)
          {
              // we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
              n_past--;
          }
  
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
          LOG_VERBOSE("prompt ingested", {
                                             {"n_past", n_past},
                                             {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
          // compare the evaluated prompt with the new prompt
          n_past = common_part(embd, prompt_tokens);
  
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
          embd = prompt_tokens;
          if (n_past == num_prompt_tokens)
          {
@@ -471,6 +472,9 @@ struct llama_server_context
              n_past--;
          }
  
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
          LOG_VERBOSE("prompt ingested", {
                                             {"n_past", n_past},
                                             {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
author	Georgi Gerganov <redacted>
	Thu, 12 Oct 2023 06:29:04 +0000 (09:29 +0300)
committer	GitHub <redacted>
	Thu, 12 Oct 2023 06:29:04 +0000 (09:29 +0300)