cparams.pipeline_parallel = pipeline_parallel;
if (cparams.pipeline_parallel) {
- LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
+ LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
}
sched_reserve();
const int64_t t_end_us = ggml_time_us();
- LLAMA_LOG_INFO("%s: reserve took %.2f ms\n", __func__, (t_end_us - t_start_us)/1000.0);
+ LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
+ __func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
}
void llama_context::synchronize() {
    // NOTE(review): this chunk contained unresolved unified-diff residue
    // ('-'/'+' hunk prefixes); resolved here by applying the '+' side of
    // the hunk so the file is valid C++ again.
    //
    // NOTE(review): 'value' is not a parameter of synchronize() — this body
    // looks like it belongs to a set_warmup(bool value) method that was
    // spliced under the wrong signature during extraction. Confirm against
    // the upstream file before relying on this; the signature is kept
    // unchanged to avoid breaking callers.
    cparams.warmup = value;

    // warmups are usually done with small batches, so there is no need to
    // trigger a scheduler re-reserve here (this was the intent of the patch
    // that disabled the line below).
    //sched_need_reserve = true;
}
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {