common : preallocate sampling token data vector (#8363)

author Kevin Wang <redacted>
Mon, 8 Jul 2024 07:26:53 +0000 (03:26 -0400)
committer GitHub <redacted>
Mon, 8 Jul 2024 07:26:53 +0000 (10:26 +0300)
diff --git a/common/sampling.cpp b/common/sampling.cpp
index e9db2715f2649e0926a86cb809c93d1a6fafe3e0..6a483c8154574085eeff6b0f59c419c7f3de619c 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
      if (ctx_sampling->grammar != NULL && !apply_grammar) {
          GGML_ASSERT(original_logits != NULL);
          // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
      }
  
      // apply params.logit_bias map
@@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
          llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
      }
  
-    cur.clear();
+    cur.resize(n_vocab);
  
      for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
      }
  
      llama_token_data_array cur_p = { cur.data(), cur.size(), false };