common : avoid unnecessary logits fetch (#8358)

author Kevin Wang <redacted>
Mon, 8 Jul 2024 06:31:55 +0000 (02:31 -0400)
committer GitHub <redacted>
Mon, 8 Jul 2024 06:31:55 +0000 (09:31 +0300)
diff --git a/common/sampling.cpp b/common/sampling.cpp

index 9f332fe5736838d58da96edfa8a39fb2ac4a2329..e9db2715f2649e0926a86cb809c93d1a6fafe3e0 100644 (file)
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
          GGML_ASSERT(!original_logits.empty());
      }
      llama_token id = 0;
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
  
      if (temp < 0.0) {
          // greedy sampling, with probs
@@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
      }
  
      if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
+
          // Create an array with a single token data element for the sampled id
          llama_token_data single_token_data = {id, logits[id], 0.0f};
          llama_token_data_array single_token_data_array = { &single_token_data, 1, false };