From: Sigbjørn Skjæret Date: Thu, 14 Aug 2025 10:22:07 +0000 (+0200) Subject: cuda : fix GGML_CUDA_GRAPHS=OFF (#15300) X-Git-Tag: upstream/0.0.6164~9 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=4ebd0c125b24a0d7a78b0ffc1d9567530ed8f0c4;p=pkg%2Fggml%2Fsources%2Fllama.cpp cuda : fix GGML_CUDA_GRAPHS=OFF (#15300) * fix USE_CUDA_GRAPH=OFF ggml-ci * check capture status * completely disable capturing check instead --- diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 2ad49323..347abc18 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -25,9 +25,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // Special case for reducing vectors #ifdef GGML_CUDA_USE_CUB +#ifdef USE_CUDA_GRAPH cudaStreamCaptureStatus iscapturing; CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing)); +#endif // USE_CUDA_GRAPH if ((nrows == 1) && +#ifdef USE_CUDA_GRAPH // CUDA_GRAPHS_DISABLED ((ncols > 65536) && ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || @@ -38,6 +41,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates || ctx.cuda_graph->disable_due_to_failed_graph_capture))) { +#else + (ncols > 65536)) { +#endif // USE_CUDA_GRAPH // Single row - use device-wide reduction size_t tmp_size = 0; ggml_cuda_pool & pool = ctx.pool(); @@ -51,7 +57,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { divide_by_count<<<1, 1, 0, stream>>>(dst_d, ncols); return; } -#endif +#endif // GGML_CUDA_USE_CUB const dim3 block_nums(nrows, 1, 1);