if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
-
- if (!graph_reuse_disable) {
- // TODO: figure out a way to make graph reuse work with pipeline parallelism
- // ref: https://github.com/ggml-org/llama.cpp/pull/20463
- LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
-
- graph_reuse_disable = true;
- }
}
sched_reserve();
if (!graph_reuse_disable && res->can_reuse(gparams)) {
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+ // with pipeline parallelism, the previous graph_compute_async call may still be
+ // running on the GPU. we must synchronize before set_inputs to avoid overwriting
+ // input tensors that the previous compute is still reading.
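+ // note: ggml_backend_sched_synchronize waits for all backends used by the
+ // scheduler to finish their pending work, so the reused input buffers are
+ // safe to write afterwards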
+ if (cparams.pipeline_parallel) {
+ ggml_backend_sched_synchronize(sched.get());
+ }
+
n_reused++;
} else {
res->reset();