vulkan: allow graph_optimize for prompt processing workloads (llama/17475)

author Jeff Bolz <redacted>

Wed, 26 Nov 2025 15:46:33 +0000 (09:46 -0600)

committer Georgi Gerganov <redacted>

Thu, 11 Dec 2025 13:32:46 +0000 (15:32 +0200)
author Jeff Bolz <redacted>
Wed, 26 Nov 2025 15:46:33 +0000 (09:46 -0600)
committer Georgi Gerganov <redacted>
Thu, 11 Dec 2025 13:32:46 +0000 (15:32 +0200)
diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp

index 9c97f0a6faf7ebeb4894412cf43ab4d6721217f2..7f2cf795c98802463ddff8e5245e7a264273c664 100644 (file)
--- a/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13158,24 +13158,6 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
          return false;
      };
  
-    // This function tries to reorder the graph to allow nodes to run in parallel.
-    // This helps with small batches, but for large batches its a slowdown, probably
-    // due to cache contention. So only reorder if the majority of nodes have few rows.
-    int num_small_nodes = 0;
-    int num_counted_nodes = 0;
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        if (!is_empty(graph->nodes[i]) &&
-            graph->nodes[i]->op != GGML_OP_SET_ROWS) {
-            if (ggml_nrows(graph->nodes[i]) <= 8) {
-                num_small_nodes++;
-            }
-            num_counted_nodes++;
-        }
-    }
-    if (num_small_nodes < num_counted_nodes / 2) {
-        return;
-    }
-
      std::vector<ggml_tensor *> new_order;
      std::vector<bool> used(graph->n_nodes, false);
      std::set<ggml_tensor *> used_node_set;
author	Jeff Bolz <redacted>
	Wed, 26 Nov 2025 15:46:33 +0000 (09:46 -0600)
committer	Georgi Gerganov <redacted>
	Thu, 11 Dec 2025 13:32:46 +0000 (15:32 +0200)