// Special case for reducing vectors
#ifdef GGML_CUDA_USE_CUB
+#ifdef USE_CUDA_GRAPH
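// note whether this stream is currently capturing a CUDA graph; the gate below needs it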
cudaStreamCaptureStatus iscapturing;
CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
+#endif // USE_CUDA_GRAPH
if ((nrows == 1) &&
+#ifdef USE_CUDA_GRAPH
// CUDA_GRAPHS_DISABLED: take the CUB path only when no graph is in use (no instance and the
// stream is not capturing) or graphs were disabled (GPU arch, too many updates, failed capture)
((ncols > 65536) &&
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
+#else
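+// without CUDA graph support, gate the CUB path on size alone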
+ (ncols > 65536)) {
+#endif // USE_CUDA_GRAPH
// Single row - use device-wide reduction
size_t tmp_size = 0;
ggml_cuda_pool & pool = ctx.pool();
// standard CUB two-phase pattern: the first call queries the temp storage size, the second runs the sum
DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
// divide the sum by the element count to turn it into the mean
divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
return;
}
-#endif
+#endif // GGML_CUDA_USE_CUB
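// fallback: row-wise reduction kernel, launched with one thread block per row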
const dim3 block_nums(nrows, 1, 1);