sched : fix multiple evaluations of the same graph with pipeline parallelism (llama...

author Diego Devesa <redacted>
Fri, 25 Jul 2025 08:07:26 +0000 (01:07 -0700)
committer Georgi Gerganov <redacted>
Mon, 28 Jul 2025 05:43:21 +0000 (08:43 +0300)
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp

index b7498b8d40238f44ed6bbbfc1134b5f5663b10ca..eaf41e5a6c84d23ff37df5676ec1502f5d56c357 100644 (file)
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -647,6 +647,7 @@ struct ggml_backend_sched {
      // pipeline parallelism support
      int n_copies;
      int cur_copy;
+    int next_copy;
      ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
      struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
      int n_graph_inputs;
@@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
          }
      }
  
-    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
      return GGML_STATUS_SUCCESS;
  }
  
@@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
      GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
  
-    ggml_backend_sched_split_graph(sched, measure_graph);
-
      ggml_backend_sched_synchronize(sched);
  
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
      if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
          return false;
      }
@@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
  
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
      GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+    GGML_ASSERT(!sched->is_alloc);
+
+    sched->cur_copy = sched->next_copy;
+    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
  
      ggml_backend_sched_split_graph(sched, graph);
  
@@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
          // if the graph is not already allocated, always use copy 0 after a synchronization
          // this ensures that during generation the same copy is used every time,
          // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
-        sched->cur_copy = 0;
+        sched->next_copy = 0;
      }
  }
author	Diego Devesa <redacted>
	Fri, 25 Jul 2025 08:07:26 +0000 (01:07 -0700)
committer	Georgi Gerganov <redacted>
	Mon, 28 Jul 2025 05:43:21 +0000 (08:43 +0300)