{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
- const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+ supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
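+ // note: this now writes the class member (declared in llama-context.h, see the hunk
+ // below) instead of a function-local const, so the decode/encode paths can consult
+ // the same flag later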
if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
cparams.kv_unified = true;
}
}
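// usage (illustrative, not part of this diff): the flag is read once at context
// creation, so a hypothetical invocation such as `LLAMA_SET_ROWS=1 ./llama-cli -m model.gguf`
// opts in to the ggml_set_rows() path; any non-zero value enables it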
+ if (!supports_set_rows) {
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+ // overlap with device computation.
+ ggml_backend_sched_reset(sched.get());
+ }
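+ // (illustrative rationale: the scheduler reset is CPU-only bookkeeping, so issuing it
+ // before the backend sync lets it overlap with in-flight device work; when set_rows is
+ // supported the reset is skipped here, presumably to keep scheduler state for the next graph)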
+
// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();
+ if (!supports_set_rows) {
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+ // overlap with device computation.
+ ggml_backend_sched_reset(sched.get());
+ }
+
return 0;
}
bool has_evaluated_once = false;
+ // env: LLAMA_SET_ROWS (temporary)
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+ bool supports_set_rows = false;
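+ // defaults to false, so the set_rows code path stays opt-in via the environment
+ // variable until this temporary flag is removed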
+
// perf
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;