// pipeline parallelism support
int n_copies;
int cur_copy;
+ int next_copy;
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
int n_graph_inputs;
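// [Not part of the patch] Field roles as used in the hunks below: cur_copy is the
// input copy the currently allocated graph was built against, and the new next_copy
// is the copy that the next ggml_backend_sched_alloc_graph call will switch to; the
// per-backend, per-copy events are presumably what lets in-flight work on older
// copies be waited on before their inputs are reused.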
...

}
}
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
return GGML_STATUS_SUCCESS;
}
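// [Not part of the patch] The rotation removed above is not gone: it reappears in
// ggml_backend_sched_alloc_graph below as cur_copy = next_copy; next_copy =
// (next_copy + 1) % n_copies. Advancing the copy at allocation time instead of at
// compute time means an allocated graph keeps the copy it was built against, so
// (presumably) computing one graph and preparing the next can no longer disagree
// about which copy is current.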
...

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
- ggml_backend_sched_split_graph(sched, measure_graph);
-
ggml_backend_sched_synchronize(sched);
+ ggml_backend_sched_split_graph(sched, measure_graph);
+
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}
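// [Not part of the patch] Note the reordering in reserve: it now synchronizes before
// splitting the measure graph, presumably so the copy bookkeeping is already in its
// post-sync state (next_copy rewound to 0 while nothing is allocated, per the
// synchronize hunk further down) when the measure graph is split and reserved.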
...

bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+ GGML_ASSERT(!sched->is_alloc);
+
+ sched->cur_copy = sched->next_copy;
+ sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
ggml_backend_sched_split_graph(sched, graph);
...

// if the graph is not already allocated, always use copy 0 after a synchronization
// this ensures that during generation the same copy is used every time,
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
- sched->cur_copy = 0;
+ sched->next_copy = 0;
}
}
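// [Illustrative sketch, not part of the patch] A self-contained model of the copy
// rotation shown above, under hypothetical names (toy_sched, toy_alloc_graph,
// toy_reset, toy_synchronize); only the n_copies/cur_copy/next_copy/is_alloc fields
// mirror the real scheduler, and backend work/event handling is elided. The point it
// demonstrates: allocating a graph commits the announced copy and rotates next_copy,
// while a synchronize with no graph allocated rewinds next_copy to 0, so a
// token-by-token generation loop keeps landing on copy 0 and the graph stays stable
// (the property the comment above protects for CUDA graphs).

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_sched {
    int  n_copies;   // number of buffered input copies (> 1 with pipeline parallelism)
    int  cur_copy;   // copy used by the currently allocated graph
    int  next_copy;  // copy the next allocated graph will use
    bool is_alloc;   // whether a graph is currently allocated
};

static void toy_alloc_graph(struct toy_sched * s) {
    assert(!s->is_alloc);                             // mirrors GGML_ASSERT(!sched->is_alloc)
    s->cur_copy  = s->next_copy;                      // commit the announced copy
    s->next_copy = (s->next_copy + 1) % s->n_copies;  // rotate for the following graph
    s->is_alloc  = true;
}

static void toy_reset(struct toy_sched * s) {
    s->is_alloc = false;                              // stand-in for ggml_backend_sched_reset
}

static void toy_synchronize(struct toy_sched * s) {
    // real backend synchronization elided; only the copy bookkeeping is modeled
    if (!s->is_alloc) {
        s->next_copy = 0;                             // restart at copy 0 after a sync
    }
}

int main(void) {
    struct toy_sched s = { .n_copies = 2, .cur_copy = 0, .next_copy = 0, .is_alloc = false };

    // back-to-back allocations (loosely: pipelined micro-batches) rotate the copies
    toy_alloc_graph(&s); printf("batch 0 -> copy %d\n", s.cur_copy); toy_reset(&s);  // copy 0
    toy_alloc_graph(&s); printf("batch 1 -> copy %d\n", s.cur_copy); toy_reset(&s);  // copy 1

    // generation: a synchronize between graphs rewinds next_copy, so every token's
    // graph is allocated on copy 0
    for (int t = 0; t < 3; t++) {
        toy_synchronize(&s);
        toy_alloc_graph(&s);
        printf("token %d -> copy %d\n", t, s.cur_copy);                              // copy 0
        toy_reset(&s);
    }
    return 0;
}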