// NOTE(review): this chunk appears to be corrupted patch/merge residue rather than
// compilable source. It splices two unrelated fragments — the tail of a CUDA
// graph-evaluation loop (stream assignment / fusion debug logging) and part of an
// FFN/MoE graph builder — and contains stray unified-diff hunk markers ('-' / '+')
// embedded as line prefixes. Issues are flagged inline below; no code is changed.
GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
}
}
// NOTE(review): stray diff marker — '- prev_i = i;' is a removed-line hunk from a
// patch, not valid C++. The removal pairs with the '+' hunk after the #ifdef block.
- prev_i = i;
#ifdef GGML_CUDA_DEBUG
// Number of graph nodes fused since the last processed node index (prev_i);
// only computed/logged in debug builds.
const int nodes_fused = i - prev_i - 1;
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
// NOTE(review): this '}' just before '#endif' looks unbalanced relative to the
// '#ifdef' region above — confirm brace matching against the full function; it may
// be merge damage rather than intentional scoping.
}
#endif
// NOTE(review): stray diff marker — '+ prev_i = i;' is an added-line hunk; the
// intended code is presumably 'prev_i = i;' (moving the update after the debug
// block so nodes_fused is computed before prev_i advances) — TODO confirm upstream.
+ prev_i = i;
// Skip empty tensors and layout-only ops (reshape/transpose/view/permute/none):
// they require no kernel launch, so this iteration is skipped.
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
// NOTE(review): unreachable — the 'continue' above always exits this iteration,
// so this GGML_ABORT can never execute; it looks like injected/merge junk and
// should be removed once the full function is in view.
GGML_ABORT("fatal error");
}
// NOTE(review): the remainder of this chunk belongs to a different function (an
// FFN/MoE graph builder), not the loop above — further evidence this chunk splices
// two files. The next three lines are diff-removal hunks, not code.
- //expand here so that we can fuse ffn gate
- ggml_build_forward_expand(gf, cur);
-
// Parallel-gate FFN path: element-wise multiply the gate branch (tmp) into cur.
if (gate && type_gate == LLM_FFN_PAR) {
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
// NOTE(review): a GGML_ABORT immediately after the normal gate-multiply path
// would abort every parallel-gate build; this is almost certainly injected junk —
// verify against the upstream implementation before removing.
GGML_ABORT("fatal error");
}
// NOTE(review): another pair of diff-removal hunks (same 'expand for fusion'
// lines as above) — patch residue, not code.
- //expand here so that we can fuse ffn gate
- ggml_build_forward_expand(gf, cur);
-
// Expert down-projection via LoRA-aware matmul-with-ids; comment on the original
// line documents the result shape as [n_embd, n_expert_used, n_tokens].
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);