CUDA: better error for FlashAttention (FA) kernel with 0 occupancy (llama/16643)

author Johannes Gäßler <redacted>

Tue, 21 Oct 2025 13:27:53 +0000 (15:27 +0200)

committer Georgi Gerganov <redacted>

Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)
author Johannes Gäßler <redacted>
Tue, 21 Oct 2025 13:27:53 +0000 (15:27 +0200)
committer Georgi Gerganov <redacted>
Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)
diff --git a/src/ggml-cuda/fattn-common.cuh b/src/ggml-cuda/fattn-common.cuh

index bc0c2523cc82f7b790c603c67a5055f5eb435466..218ccff14e7cc6bf1a883cafc22546142c01484a 100644 (file)
--- a/src/ggml-cuda/fattn-common.cuh
+++ b/src/ggml-cuda/fattn-common.cuh
@@ -895,6 +895,7 @@ void launch_fattn(
      const dim3 block_dim(warp_size, nwarps, 1);
      int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
      CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+    GGML_ASSERT(max_blocks_per_sm > 0);
      int parallel_blocks = max_blocks_per_sm;
  
      dim3 blocks_num;
author	Johannes Gäßler <redacted>
	Tue, 21 Oct 2025 13:27:53 +0000 (15:27 +0200)
committer	Georgi Gerganov <redacted>
	Sat, 1 Nov 2025 07:41:35 +0000 (09:41 +0200)