llama : do a warm-up eval at start for better timings (#1824)

author Georgi Gerganov <redacted>

Tue, 13 Jun 2023 17:20:07 +0000 (20:20 +0300)

committer GitHub <redacted>

Tue, 13 Jun 2023 17:20:07 +0000 (20:20 +0300)
author Georgi Gerganov <redacted>
Tue, 13 Jun 2023 17:20:07 +0000 (20:20 +0300)
committer GitHub <redacted>
Tue, 13 Jun 2023 17:20:07 +0000 (20:20 +0300)
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 66d563143a5c624c7a2222f4fccd865148e0ae76..efa913e165f6cfc11eaf2d5c4d5ff18e77db56f7 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -331,6 +331,13 @@ int main(int argc, char ** argv) {
  
      std::vector<llama_token> embd;
  
+    // warm up the model with a single BOS-token eval, then reset the timings so the warm-up is not counted
+    {
+        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx);
+    }
+
      while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
          // predict
          if (embd.size() > 0) {
author	Georgi Gerganov <redacted>
	Tue, 13 Jun 2023 17:20:07 +0000 (20:20 +0300)
committer	GitHub <redacted>
	Tue, 13 Jun 2023 17:20:07 +0000 (20:20 +0300)