From: Diego Devesa
Date: Mon, 27 Oct 2025 20:51:28 +0000 (-0700)
Subject: llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
X-Git-Tag: upstream/0.0.7011~154
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=5a4ff43e7dd049e35942bc3d12361dab2f155544;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
---

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index bd348bca..f6192a36 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -268,9 +268,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-                throw std::runtime_error("failed to allocate compute pp buffers");
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
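
Note (illustrative, not part of the commit): the patch applies a graceful-degradation pattern. Pipeline parallelism keeps multiple copies of the compute graph, so its compute buffers are larger and may fail to allocate; instead of aborting context creation, the scheduler is recreated with parallelism disabled and the graph reservation is retried, and only a second failure is fatal. The self-contained C++ sketch below models that retry shape under stated assumptions, using hypothetical stand-ins (Scheduler, reserve_graph) rather than the real ggml_backend_sched API.

    // Sketch of the fallback pattern above; Scheduler and reserve_graph are
    // hypothetical stand-ins, not ggml APIs.
    #include <cstdio>
    #include <memory>
    #include <stdexcept>

    // Hypothetical scheduler: with pipeline parallelism it would keep several
    // graph copies, which need larger compute buffers.
    struct Scheduler {
        bool parallel;
        explicit Scheduler(bool p) : parallel(p) {}
    };

    // Hypothetical reservation: simulate an allocation failure only when the
    // parallel (larger) configuration is requested.
    static bool reserve_graph(const Scheduler & sched) {
        return !sched.parallel;
    }

    int main() {
        bool pipeline_parallel = true;
        auto sched = std::make_unique<Scheduler>(pipeline_parallel);

        // Same shape as the patch: try once; if the reservation fails while
        // pipeline parallelism is on, rebuild the scheduler without it and retry.
        bool ok = reserve_graph(*sched);
        if (!ok) {
            if (pipeline_parallel) {
                std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
                sched = std::make_unique<Scheduler>(false);
                ok = reserve_graph(*sched);
            }
            if (!ok) {
                throw std::runtime_error("failed to allocate compute pp buffers");
            }
        }
        std::printf("graph reserved (parallel=%d)\n", sched->parallel ? 1 : 0);
        return 0;
    }

The trade-off is throughput rather than correctness: with the fallback, the context is still created and usable, just without whatever overlap pipeline parallelism would have provided.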