CUDA: Fix bug in topk-moe for gpt-oss (llama/16821)

author Aman Gupta <redacted>
Wed, 29 Oct 2025 07:55:06 +0000 (15:55 +0800)
committer Georgi Gerganov <redacted>
Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu

index be505748af5a4287379404b64ad05ec645957e99..fcff5d7cdc1f5cef9540b111a32afcddffbdd76f 100644 (file)
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -2978,7 +2978,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
  
      if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx + 9];
  
@@ -2997,7 +2997,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
      }
  
      if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
          ggml_tensor * weights = cgraph->nodes[node_idx + 5];
  
@@ -3118,9 +3118,20 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
          // With the use of CUDA graphs, the execution will be performed by the graph launch.
          if (!use_cuda_graph || cuda_graph_update_required) {
  
+            [[maybe_unused]] int prev_i = 0;
+
              for (int i = 0; i < cgraph->n_nodes; i++) {
                  ggml_tensor * node = cgraph->nodes[i];
  
+
+#ifdef GGML_CUDA_DEBUG
+                const int nodes_fused = i - prev_i - 1;
+                prev_i = i;
+                if (nodes_fused > 0) {
+                    GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
+                }
+#endif
+
                  if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
                      continue;
                  }
author	Aman Gupta <redacted>
	Wed, 29 Oct 2025 07:55:06 +0000 (15:55 +0800)
committer	Georgi Gerganov <redacted>
	Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)