git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
perplexity : fix kv cache handling for hellaswag (#4981)
author: Georgi Gerganov <redacted>
Tue, 16 Jan 2024 17:34:54 +0000 (19:34 +0200)
committer: GitHub <redacted>
Tue, 16 Jan 2024 17:34:54 +0000 (19:34 +0200)
ggml-ci

examples/perplexity/perplexity.cpp

index 9a77beca6df323870335f77b102aa73590fbb9f3..b4fedf8039c7851279e2d36ef03a8e91f0244815 100644 (file)
@@ -428,6 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
     for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
         size_t n_tokens = tokens.size() - i_chunk * n_batch;
         n_tokens = std::min(n_tokens, size_t(n_batch));
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
         if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return {};