CUDA: skip fusion for repeating adds in bias (#17080)

author Aman Gupta <redacted>

Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)

committer GitHub <redacted>

Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)
author Aman Gupta <redacted>
Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)
committer GitHub <redacted>
Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt

index 3024775135966133a9afe7857f687aa75640acf1..67af1d8ccc1822fb70527a8ac11f9e5a560865c1 100644 (file)
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)
  
      if (GGML_CUDA_DEBUG)
          list(APPEND CUDA_FLAGS -lineinfo)
+        add_compile_definitions(GGML_CUDA_DEBUG)
      endif()
  
      if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index 049aece1b5234944418b76add805283149d1be13..2d4314fba4fdc97a020c0c1d169d47f43acffd6a 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
  
              for (int i = 0; i < cgraph->n_nodes; i++) {
                  ggml_tensor * node = cgraph->nodes[i];
-
-
  #ifdef GGML_CUDA_DEBUG
                  const int nodes_fused = i - prev_i - 1;
                  prev_i = i;
@@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                                  continue;
                              }
  
+                            // fusion doesn't support repeating (broadcast) bias adds: skip unless both operands of each add have the same shape
+                            if (bias_op == GGML_OP_ADD &&
+                                (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+                                 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+                                continue;
+                            }
+
                              const ggml_tensor * src0 = up_n->src[0];
                              const ggml_tensor * src1 = up_n->src[1];
                              const ggml_tensor * ids  = up_n->src[2];
@@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                              continue;
                          }
  
+                        if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+                            continue;
+                        }
+
                          ggml_cuda_mm_fusion_args_host fusion_data{};
                          fusion_data.x_bias = bias_tensor;
  
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 80216f6f3a2a66a0cc81ac5b7da671236c604291..31625bcc7a44bfa88dc18e01e7b361f6e5e16157 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4984,8 +4984,10 @@ struct test_mul_mat_vec_fusion : public test_case {
  
      ggml_tensor * build_graph(ggml_context * ctx) override {
          if (!use_id) {
-            std::array<int64_t, 4> ne = {k, m, 1, 1};
-            std::array<int64_t, 4> ne0 = {k, n, 1, 1};
+            const int              channels = 4;
+            const int              samples  = 2;
+            std::array<int64_t, 4> ne       = { k, m, channels, samples };
+            std::array<int64_t, 4> ne0      = { k, n, channels, samples };
  
              ggml_tensor * cur  = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
              ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr;
@@ -4993,14 +4995,14 @@ struct test_mul_mat_vec_fusion : public test_case {
  
              ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
              if (with_bias) {
-                std::array<int64_t, 4> bias_ne = {ffn_up->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_up->ne[0], 1, channels, samples };
                  ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                  ffn_up = ggml_add(ctx, ffn_up, up_bias);
              }
  
              ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr;
              if (with_bias && with_gate) {
-                std::array<int64_t, 4> bias_ne = {ffn_gate->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne   = { ffn_gate->ne[0], 1, channels, samples };
                  ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                  ffn_gate = ggml_add(ctx, ffn_gate, gate_bias);
              }
author	Aman Gupta <redacted>
	Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)
committer	GitHub <redacted>
	Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)
ggml/src/ggml-cuda/CMakeLists.txt		patch \| blob \| history
ggml/src/ggml-cuda/ggml-cuda.cu		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history