From: Diego Devesa
Date: Mon, 27 Oct 2025 20:51:28 +0000 (-0700)
Subject: llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
X-Git-Tag: upstream/0.0.7011~154
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=5a4ff43e7dd049e35942bc3d12361dab2f155544;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
---

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index bd348bca..f6192a36 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -268,9 +268,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-                throw std::runtime_error("failed to allocate compute pp buffers");
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
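
Note (illustrative, not part of the commit): the patch applies a graceful-degradation pattern. Pipeline parallelism keeps multiple copies of the compute graph, so its compute buffers are larger and may fail to allocate; instead of aborting context creation, the scheduler is recreated with parallelism disabled and the graph reservation is retried, and only a second failure is fatal. The self-contained C++ sketch below models that retry shape under stated assumptions, using hypothetical stand-ins (Scheduler, reserve_graph) rather than the real ggml_backend_sched API.

    // Sketch of the fallback pattern above; Scheduler and reserve_graph are
    // hypothetical stand-ins, not ggml APIs.
    #include <cstdio>
    #include <memory>
    #include <stdexcept>

    // Hypothetical scheduler: with pipeline parallelism it would keep several
    // graph copies, which need larger compute buffers.
    struct Scheduler {
        bool parallel;
        explicit Scheduler(bool p) : parallel(p) {}
    };

    // Hypothetical reservation: simulate an allocation failure only when the
    // parallel (larger) configuration is requested.
    static bool reserve_graph(const Scheduler & sched) {
        return !sched.parallel;
    }

    int main() {
        bool pipeline_parallel = true;
        auto sched = std::make_unique<Scheduler>(pipeline_parallel);

        // Same shape as the patch: try once; if the reservation fails while
        // pipeline parallelism is on, rebuild the scheduler without it and retry.
        bool ok = reserve_graph(*sched);
        if (!ok) {
            if (pipeline_parallel) {
                std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
                sched = std::make_unique<Scheduler>(false);
                ok = reserve_graph(*sched);
            }
            if (!ok) {
                throw std::runtime_error("failed to allocate compute pp buffers");
            }
        }
        std::printf("graph reserved (parallel=%d)\n", sched->parallel ? 1 : 0);
        return 0;
    }

The trade-off is throughput rather than correctness: with the fallback, the context is still created and usable, just without whatever overlap pipeline parallelism would have provided.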