if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
-
- if (!graph_reuse_disable) {
- // TODO: figure out a way to make graph reuse work with pipeline parallelism
- // ref: https://github.com/ggml-org/llama.cpp/pull/20463
- LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
-
- graph_reuse_disable = true;
- }
}
sched_reserve();
if (!graph_reuse_disable && res->can_reuse(gparams)) {
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+ // with pipeline parallelism, the previous graph_compute_async call may still be
+ // running on the GPU. we must synchronize before set_inputs to avoid overwriting
+ // input tensors that the previous compute is still reading.
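+ // note: ggml_backend_sched_synchronize waits for all backends used by the
+ // scheduler to finish their pending work, so the reused input buffers are
+ // safe to write afterwards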
+ if (cparams.pipeline_parallel) {
+ ggml_backend_sched_synchronize(sched.get());
+ }
+
n_reused++;
} else {
res->reset();