vulkan: sort graph to allow more parallel execution (#15850)
author    Jeff Bolz <redacted>
Mon, 8 Sep 2025 18:10:07 +0000 (13:10 -0500)
committer GitHub <redacted>
Mon, 8 Sep 2025 18:10:07 +0000 (02:10 +0800)
* vulkan: sort graph to allow more parallel execution

Add a backend proc to allow the backend to modify the graph. The
vulkan implementation looks at which nodes depend on each other
and greedily reorders them to group together nodes that don't
depend on each other. It only reorders the nodes; it doesn't
change the contents of any of them.

With #15489, this reduces the number of synchronizations needed.

* call optimize_graph per-split
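
The greedy pass is easiest to see on a toy graph. The sketch below is a simplified,
standalone illustration, not the actual implementation (toy_node and greedy_sort are
made-up names; the real ggml_vk_optimize_graph in ggml-vulkan.cpp below additionally
handles view nodes, a row-count heuristic, and fusion exceptions such as
RMS_NORM + MUL). It mirrors the core loop: take the first node that hasn't run, scan
a bounded window ahead, and pull in every node whose inputs have all already run.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for ggml_tensor: each node records the
    // indices of the nodes it reads from.
    struct toy_node {
        int id;
        std::vector<int> srcs;
    };

    // Greedily group nodes that can run without waiting on each other.
    // A node joins the current set only if every one of its sources has
    // already been emitted in a previous set.
    static std::vector<toy_node> greedy_sort(const std::vector<toy_node> & nodes) {
        const size_t NUM_TO_CHECK = 20; // bounded lookahead, as in the real pass
        std::vector<bool> used(nodes.size(), false);
        std::vector<toy_node> out;
        size_t first_unused = 0;
        while (first_unused < nodes.size()) {
            std::vector<size_t> set = { first_unused };
            const size_t end = std::min(first_unused + NUM_TO_CHECK, nodes.size());
            for (size_t j = first_unused + 1; j < end; ++j) {
                if (used[j]) continue;
                bool ok = true;
                for (int s : nodes[j].srcs) {
                    if (!used[(size_t) s]) { ok = false; break; } // input hasn't run yet
                }
                if (ok) set.push_back(j);
            }
            for (size_t c : set) { out.push_back(nodes[c]); used[c] = true; }
            while (first_unused < nodes.size() && used[first_unused]) first_unused++;
        }
        return out;
    }

    int main() {
        // n1 reads n0, n3 reads n2; n0 and n2 are independent of each other.
        std::vector<toy_node> g = { {0, {}}, {1, {0}}, {2, {}}, {3, {2}} };
        for (const toy_node & n : greedy_sort(g)) printf("%d ", n.id);
        printf("\n"); // 0 2 1 3 -- {n0, n2} can overlap, then {n1, n3}
        return 0;
    }

With #15489's per-set synchronization, each emitted set needs only one barrier
instead of one per dependent node.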

13 files changed:
ggml/src/ggml-backend-impl.h
ggml/src/ggml-backend.cpp
ggml/src/ggml-blas/ggml-blas.cpp
ggml/src/ggml-cann/ggml-cann.cpp
ggml/src/ggml-cpu/ggml-cpu.cpp
ggml/src/ggml-cuda/ggml-cuda.cu
ggml/src/ggml-metal/ggml-metal.m
ggml/src/ggml-opencl/ggml-opencl.cpp
ggml/src/ggml-rpc/ggml-rpc.cpp
ggml/src/ggml-sycl/ggml-sycl.cpp
ggml/src/ggml-vulkan/ggml-vulkan.cpp
ggml/src/ggml-webgpu/ggml-webgpu.cpp
ggml/src/ggml-zdnn/ggml-zdnn.cpp

diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d6579ac45df41cf44fa1e663204eb134c9..2db5c4e0f85e5934ade50fe891a5b450084d5774 100644
@@ -114,6 +114,9 @@ extern "C" {
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
         // wait for an event on a different stream
         void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) sort/optimize the nodes in the graph
+        void                      (*optimize_graph)    (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     };
 
     struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index f615ab4beecb147fc4edb6c5e3ac74d551e5cdf2..7646f3f1346a44ac67663512cf3148628c24cb5b 100644
@@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
     backend->iface.event_wait(backend, event);
 }
 
+static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
+    if (backend->iface.optimize_graph != NULL) {
+        backend->iface.optimize_graph(backend, cgraph);
+    }
+}
+
 // Backend device
 
 const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
@@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
+        // Optimize this split of the graph. This needs to happen before we make graph_copy,
+        // so they are in sync.
+        ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
+
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             assert(graph_copy->size > (graph_copy->n_nodes + 1));
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index aeac2e57449a25236902f9f6a9f5ae4fdcedcfd0..cdfc5a9bc2340f647c2007392c0e5fec1cc57d4c 100644
@@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_blas_guid(void) {
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 2f9f373f54077df4f552bc1c4c14889dc9bd3436..2e47ad90afda41154a5003c48929ad3d3074a060 100755
@@ -2690,6 +2690,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_compute           = */ ggml_backend_cann_graph_compute,
     /* .event_record            = */ ggml_backend_cann_event_record,
     /* .event_wait              = */ ggml_backend_cann_event_wait,
+    /* .optimize_graph          = */ NULL,
 };
 
 /**
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 0c15165f1b88e128c5a50cee9d26d2c095047604..2b81f8b9afa22349889fd1f37838de1b081c4726 100644
@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 0c6bd363961f1c49fd805115c578a8a284624164..efca2c775a4006df3a06f27ad6e1e9831644b746 100644
@@ -3135,6 +3135,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .event_record            = */ ggml_backend_cuda_event_record,
     /* .event_wait              = */ ggml_backend_cuda_event_wait,
+    /* .optimize_graph          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cuda_guid() {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index eeb6c9d4b70f2454899430c02aaa2fe1738b829a..e76fb712631e1332e60c402572774d6f3b6f0dd6 100644
@@ -6275,6 +6275,7 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_metal_guid(void) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 727163b7fdf950e010b975e4b449c3e6d5af723a..b188c5af34562f85f23d830b758878506ceabcdb 100644
@@ -2838,6 +2838,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
     /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };
 
 ggml_backend_t ggml_backend_opencl_init(void) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index e84ff93efc32cc77096778bde45bb91af11e272c..d4833068d0016261cd9dbbdc6b34dcf4166d39c0 100644
@@ -795,6 +795,7 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .graph_compute           = */ ggml_backend_rpc_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };
 
 ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 877fbf7e8626d11f1d5b3fa596be42151aadfcde..619ccaefc0bf8de88dc4f69467c946453db08eeb 100644
@@ -4063,6 +4063,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
     /* .event_record            = */ ggml_backend_sycl_event_record,
     /* .event_wait              = */ ggml_backend_sycl_event_wait,
+    /* .optimize_graph          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_sycl_guid() {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index e6245d0cfca4fae3dbeed79d08f2e4e145f07c5c..f0fa9e668c1d2ce2625d321e3a789cbadfe837c9 100644
@@ -583,6 +583,7 @@ struct vk_device_struct {
     bool disable_fusion;
     bool disable_host_visible_vidmem;
     bool allow_sysmem_fallback;
+    bool disable_optimize_graph;
 
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
@@ -3592,6 +3593,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
         const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK");
         device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr;
 
+        const char* GGML_VK_DISABLE_OPTIMIZE_GRAPH = getenv("GGML_VK_DISABLE_OPTIMIZE_GRAPH");
+        device->disable_optimize_graph = GGML_VK_DISABLE_OPTIMIZE_GRAPH != nullptr;
+
         bool fp16_storage = false;
         bool fp16_compute = false;
         bool maintenance4_support = false;
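
To compare against the original node order at runtime, set GGML_VK_DISABLE_OPTIMIZE_GRAPH
(any value) in the environment, e.g. GGML_VK_DISABLE_OPTIMIZE_GRAPH=1 before launching;
ggml_vk_optimize_graph then returns before touching the graph.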
@@ -11853,6 +11857,131 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     UNUSED(backend);
 }
 
+// Sort the graph for improved parallelism.
+static void ggml_vk_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * graph)
+{
+    VK_LOG_DEBUG("ggml_vk_optimize_graph(" << graph->n_nodes << " nodes)");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    if (ctx->device->disable_optimize_graph) {
+        return;
+    }
+
+    auto const &is_empty = [](ggml_tensor * node) -> bool {
+        return node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+    };
+
+    auto const &is_src_of = [](const ggml_tensor *dst, const ggml_tensor *src) -> bool {
+        for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
+            if (dst->src[s] == src) {
+                return true;
+            }
+        }
+        // implicit dependency if they view the same tensor
+        const ggml_tensor *dst2 = dst->view_src ? dst->view_src : dst;
+        const ggml_tensor *src2 = src->view_src ? src->view_src : src;
+        if (dst2 == src2) {
+            return true;
+        }
+        return false;
+    };
+
+    // This function tries to reorder the graph to allow nodes to run in parallel.
+    // This helps with small batches, but for large batches it's a slowdown, probably
+    // due to cache contention. So only reorder if the majority of nodes have few rows.
+    int num_small_nodes = 0;
+    int num_counted_nodes = 0;
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        if (!is_empty(graph->nodes[i]) &&
+            graph->nodes[i]->op != GGML_OP_SET_ROWS) {
+            if (ggml_nrows(graph->nodes[i]) <= 8) {
+                num_small_nodes++;
+            }
+            num_counted_nodes++;
+        }
+    }
+    if (num_small_nodes < num_counted_nodes / 2) {
+        return;
+    }
+
+    std::vector<ggml_tensor *> new_order;
+    std::vector<bool> used(graph->n_nodes, false);
+    int first_unused = 0;
+    while (first_unused < graph->n_nodes) {
+        std::vector<int> current_set;
+
+        // First, grab the next unused node.
+        current_set.push_back(first_unused);
+
+        // Loop through the next N nodes. Grab any that don't depend on other nodes that
+        // haven't already been run. Nodes that have already been run have used[i] set
+        // to true. Allow nodes that depend on the previous node if it's a fusion pattern
+        // that we support (e.g. RMS_NORM + MUL).
+        // This first pass only grabs "real" (non-view) nodes; the second pass grabs view nodes.
+        // The goal is to not interleave real and view nodes in a way that breaks fusion.
+        const int NUM_TO_CHECK = 20;
+        for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) {
+            if (used[j]) {
+                continue;
+            }
+            if (is_empty(graph->nodes[j])) {
+                continue;
+            }
+            bool ok = true;
+            for (int c = first_unused; c < j; ++c) {
+                if (!used[c] &&
+                    is_src_of(graph->nodes[j], graph->nodes[c]) &&
+                    !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL)) {
+                    ok = false;
+                    break;
+                }
+            }
+            if (ok) {
+                current_set.push_back(j);
+            }
+        }
+        // Second pass grabs view nodes.
+        // Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add).
+        if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) {
+            for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) {
+                if (used[j]) {
+                    continue;
+                }
+                if (!is_empty(graph->nodes[j])) {
+                    continue;
+                }
+                bool ok = true;
+                for (int c = first_unused; c < j; ++c) {
+                    bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end();
+                    // skip views whose srcs haven't been processed.
+                    if (!used[c] &&
+                        is_src_of(graph->nodes[j], graph->nodes[c]) &&
+                        !c_in_current_set) {
+                        ok = false;
+                        break;
+                    }
+                }
+                if (ok) {
+                    current_set.push_back(j);
+                }
+            }
+        }
+
+        // Push the current set into new_order
+        for (auto c : current_set) {
+            new_order.push_back(graph->nodes[c]);
+            used[c] = true;
+        }
+        while (first_unused < graph->n_nodes && used[first_unused]) {
+            first_unused++;
+        }
+    }
+    // Replace the graph with the new order.
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        graph->nodes[i] = new_order[i];
+    }
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
@@ -11868,6 +11997,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ ggml_vk_optimize_graph,
 };
 
 static ggml_guid_t ggml_backend_vk_guid() {
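
A note on the row-count gate in ggml_vk_optimize_graph: ggml_nrows(node) <= 8 holds for
nearly every intermediate tensor during single-token decoding, so the reorder engages
there, while large-batch prompt processing fails the majority check and keeps the
original order, consistent with the comment that reordering large batches is a slowdown
(probably due to cache contention).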
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index aad27bf605bce30a1d3fc294e78ab2b9cde74f91..df6a3ed95a82b5ebaa3a45c035e6f5801dfbae0e 100644
@@ -665,6 +665,7 @@ static ggml_backend_i ggml_backend_webgpu_i = {
     /* .graph_compute           = */ ggml_backend_webgpu_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };
 
 /* End GGML Backend Interface */
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
index 7507a52aeae318748daaa4b11610771f062162e2..a4c51ab4e7c4a41931dcc9e53a3160ca400b45bc 100644
@@ -586,6 +586,7 @@ static ggml_backend_i ggml_backend_zdnn_i = {
     /* .graph_compute      = */ ggml_backend_zdnn_graph_compute,
     /* .event_record       = */ NULL,
     /* .event_wait         = */ NULL,
+    /* .optimize_graph     = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_zdnn_guid(void) {