llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
author Diego Devesa <redacted>
Mon, 27 Oct 2025 20:51:28 +0000 (13:51 -0700)
committer GitHub <redacted>
Mon, 27 Oct 2025 20:51:28 +0000 (21:51 +0100)
src/llama-context.cpp

index bd348bcad370a8ca5c53e3a6c576d278b6b1dba1..f6192a36e0ee5db58fd2e2399a972b8b0a2537fd 100644 (file)
@@ -268,9 +268,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-                throw std::runtime_error("failed to allocate compute pp buffers");
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
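
For context, a minimal standalone sketch of the fallback pattern this commit introduces: attempt buffer reservation with pipeline parallelism, and on failure rebuild the scheduler without it before giving up. The names below (scheduler, reserve_buffers) are hypothetical stand-ins, not the llama.cpp/ggml API.

#include <cstdio>
#include <memory>
#include <stdexcept>

// Hypothetical scheduler type standing in for ggml_backend_sched.
struct scheduler {
    bool parallel;
    explicit scheduler(bool p) : parallel(p) {}
};

// Pretend reservation helper standing in for graph_reserve(); here it
// succeeds only when parallelism is off, simulating memory pressure.
static bool reserve_buffers(const scheduler & sched) {
    return !sched.parallel;
}

int main() {
    const bool pipeline_parallel = true;
    auto sched = std::make_unique<scheduler>(pipeline_parallel);

    if (!reserve_buffers(*sched)) {
        if (pipeline_parallel) {
            std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
            // Rebuild the scheduler with parallelism disabled and retry once.
            sched = std::make_unique<scheduler>(false);
        }
        if (!reserve_buffers(*sched)) {
            // Only fail hard if the non-parallel retry also fails.
            throw std::runtime_error("failed to allocate compute buffers");
        }
    }

    std::printf("buffers reserved (parallel=%d)\n", sched->parallel ? 1 : 0);
    return 0;
}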