git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
CUDA: skip fusion for repeating adds in bias (llama/17080)
author Aman Gupta <redacted>
Sat, 8 Nov 2025 08:58:05 +0000 (16:58 +0800)
committer Georgi Gerganov <redacted>
Sun, 9 Nov 2025 21:38:03 +0000 (23:38 +0200)
ggml/src/ggml-cuda/CMakeLists.txt
ggml/src/ggml-cuda/ggml-cuda.cu

index 3024775135966133a9afe7857f687aa75640acf1..67af1d8ccc1822fb70527a8ac11f9e5a560865c1 100644 (file)
@@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)
 
     if (GGML_CUDA_DEBUG)
         list(APPEND CUDA_FLAGS -lineinfo)
+        add_compile_definitions(GGML_CUDA_DEBUG)
     endif()
 
     if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
index 049aece1b5234944418b76add805283149d1be13..2d4314fba4fdc97a020c0c1d169d47f43acffd6a 100644 (file)
@@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
             for (int i = 0; i < cgraph->n_nodes; i++) {
                 ggml_tensor * node = cgraph->nodes[i];
-
-
 #ifdef GGML_CUDA_DEBUG
                 const int nodes_fused = i - prev_i - 1;
                 prev_i = i;
@@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                                 continue;
                             }
 
+                            // we don't support repeating adds
+                            if (bias_op == GGML_OP_ADD &&
+                                (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+                                 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+                                continue;
+                            }
+
                             const ggml_tensor * src0 = up_n->src[0];
                             const ggml_tensor * src1 = up_n->src[1];
                             const ggml_tensor * ids  = up_n->src[2];
@@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                             continue;
                         }
 
+                        if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+                            continue;
+                        }
+
                         ggml_cuda_mm_fusion_args_host fusion_data{};
                         fusion_data.x_bias = bias_tensor;