ggml-cuda: add stricter checking for fusion (llama/17568)

author Aman Gupta <redacted>

Fri, 28 Nov 2025 12:34:51 +0000 (20:34 +0800)

committer Georgi Gerganov <redacted>

Thu, 11 Dec 2025 13:32:49 +0000 (15:32 +0200)
author Aman Gupta <redacted>
Fri, 28 Nov 2025 12:34:51 +0000 (20:34 +0800)
committer Georgi Gerganov <redacted>
Thu, 11 Dec 2025 13:32:49 +0000 (15:32 +0200)
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu

index 6463921a6e74b2d1eac98fd920c1219cf5699ebc..a844a3d99a2fcff6f152359dc93d9213e0f6c71a 100644 (file)
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -3050,7 +3050,12 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
      std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
          ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
  
-    if (ops.size() == topk_moe_ops_with_norm.size() &&
+    const auto is_equal = [](const std::initializer_list<enum ggml_op> & list1,
+                             const std::initializer_list<enum ggml_op> & list2) {
+        return std::equal(list1.begin(), list1.end(), list2.begin(), list2.end());
+    };
+
+    if (is_equal(topk_moe_ops_with_norm, ops) &&
          ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx + 9];
@@ -3060,8 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          }
      }
  
-    if (ops.size() == topk_moe_ops.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
+    if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx + 4];
          if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -3069,7 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          }
      }
  
-    if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
+    if (is_equal(topk_moe_ops_delayed_softmax, ops) &&
          ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
          ggml_tensor * weights = cgraph->nodes[node_idx + 5];
@@ -3085,9 +3089,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
      std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
      std::initializer_list<enum ggml_op> mul_mat_glu_ops    = { GGML_OP_MUL_MAT,    GGML_OP_MUL_MAT,    GGML_OP_GLU };
  
-    if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}) ||
-                            ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}))) {
-
+    if ((is_equal(mul_mat_bias_glu_ops, ops) || is_equal(mul_mat_id_bias_glu_ops, ops)) &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 4 })) {
          const ggml_tensor * ffn_gate      = cgraph->nodes[node_idx];
          const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
          const ggml_tensor * ffn_up        = cgraph->nodes[node_idx + 2];
@@ -3099,9 +3102,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          }
      }
  
-    if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}) ||
-                            ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}))) {
-
+    if ((is_equal(mul_mat_id_glu_ops, ops) || is_equal(mul_mat_glu_ops, ops)) &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
          const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
          const ggml_tensor * ffn_up   = cgraph->nodes[node_idx + 1];
          const ggml_tensor * glu      = cgraph->nodes[node_idx + 2];
@@ -3111,7 +3113,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          }
      }
  
-    if (ops.size() == 3 && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
+    std::initializer_list<enum ggml_op> rope_set_rows_ops = { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS };
+
+    if (is_equal(rope_set_rows_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
          const ggml_tensor * rope     = cgraph->nodes[node_idx];
          const ggml_tensor * view     = cgraph->nodes[node_idx + 1];
          const ggml_tensor * set_rows = cgraph->nodes[node_idx + 2];
author	Aman Gupta <redacted>
	Fri, 28 Nov 2025 12:34:51 +0000 (20:34 +0800)
committer	Georgi Gerganov <redacted>
	Thu, 11 Dec 2025 13:32:49 +0000 (15:32 +0200)