ggml-cuda: use passed ops instead of hardcoded ops (llama/16712)

author Aman Gupta <redacted>

Thu, 23 Oct 2025 11:14:06 +0000 (19:14 +0800)

committer Georgi Gerganov <redacted>

Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)
author Aman Gupta <redacted>
Thu, 23 Oct 2025 11:14:06 +0000 (19:14 +0800)
committer Georgi Gerganov <redacted>
Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
index 6e7c5aedbc55a55937575701f49a91d38105300c..f5a6a751acfd5293e75389d59ac9224a32015e36 100644
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
  
      if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx+8];
  
@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
      }
  
      if (ops.size() == topk_moe_ops.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx+4];
          if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
      }
  
      if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
          ggml_tensor * weights = cgraph->nodes[node_idx + 5];
author	Aman Gupta <redacted>
	Thu, 23 Oct 2025 11:14:06 +0000 (19:14 +0800)
committer	Georgi Gerganov <redacted>
	Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)