llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
author Diego Devesa <redacted>
Mon, 27 Oct 2025 20:51:28 +0000 (13:51 -0700)
committer GitHub <redacted>
Mon, 27 Oct 2025 20:51:28 +0000 (21:51 +0100)
src/llama-context.cpp

index bd348bcad370a8ca5c53e3a6c576d278b6b1dba1..f6192a36e0ee5db58fd2e2399a972b8b0a2537fd 100644 (file)
@@ -268,9 +268,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-                throw std::runtime_error("failed to allocate compute pp buffers");
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
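
For context, a minimal standalone sketch of the fallback pattern this commit introduces: attempt buffer reservation with pipeline parallelism, and on failure rebuild the scheduler without it before giving up. The names below (scheduler, reserve_buffers) are hypothetical stand-ins, not the llama.cpp/ggml API.

#include <cstdio>
#include <memory>
#include <stdexcept>

// Hypothetical scheduler type standing in for ggml_backend_sched.
struct scheduler {
    bool parallel;
    explicit scheduler(bool p) : parallel(p) {}
};

// Pretend reservation helper standing in for graph_reserve(); here it
// succeeds only when parallelism is off, simulating memory pressure.
static bool reserve_buffers(const scheduler & sched) {
    return !sched.parallel;
}

int main() {
    const bool pipeline_parallel = true;
    auto sched = std::make_unique<scheduler>(pipeline_parallel);

    if (!reserve_buffers(*sched)) {
        if (pipeline_parallel) {
            std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
            // Rebuild the scheduler with parallelism disabled and retry once.
            sched = std::make_unique<scheduler>(false);
        }
        if (!reserve_buffers(*sched)) {
            // Only fail hard if the non-parallel retry also fails.
            throw std::runtime_error("failed to allocate compute buffers");
        }
    }

    std::printf("buffers reserved (parallel=%d)\n", sched->parallel ? 1 : 0);
    return 0;
}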