CUDA : Fix CUB's argsort when nrows % block_size == 0 CCCL < 3.1 (llama/21181)

author Oliver Simons <redacted>

Mon, 30 Mar 2026 14:20:00 +0000 (16:20 +0200)

committer Georgi Gerganov <redacted>

Wed, 1 Apr 2026 13:00:26 +0000 (16:00 +0300)
author Oliver Simons <redacted>
Mon, 30 Mar 2026 14:20:00 +0000 (16:20 +0200)
committer Georgi Gerganov <redacted>
Wed, 1 Apr 2026 13:00:26 +0000 (16:00 +0300)
diff --git a/src/ggml-cuda/argsort.cu b/src/ggml-cuda/argsort.cu

index 4896669c32a848b0999864dc2708be67ed1ded12..38fdf3678c115fb1d855ad1a2895b7b886a229d3 100644 (file)
--- a/src/ggml-cuda/argsort.cu
+++ b/src/ggml-cuda/argsort.cu
@@ -47,9 +47,11 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
  #ifdef STRIDED_ITERATOR_AVAILABLE
      auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
  #else
-    ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
+    // offset_iterator needs to populate nrows + 1 elements, so we also have to ceildiv nrows + 1 by block_size
+    const int                 nrows_offset = nrows + 1;
+    ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows_offset);
      int *                     offset_iterator = offsets_alloc.get();
-    const dim3                offset_grid((nrows + block_size - 1) / block_size);
+    const dim3                offset_grid((nrows_offset + block_size - 1) / block_size);
      init_offsets<<<offset_grid, block_size, 0, stream>>>(offset_iterator, ncols, nrows);
  #endif
      CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 6a4f9b634b222643ab6bf376851a8bf71e8dc46b..781c621d930dcd79a8a6ce20a41d71099af1007f 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -8424,6 +8424,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
          test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1023, 2, 1, 3}, order));
          test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 2, 1, 3}, order));
          test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 256, 1, 1}, order)); // test ceildiv in CUDA's CUB's DeviceSegmentedSort
          test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2047, 2, 1, 3}, order));
          test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order));
          test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order));
author	Oliver Simons <redacted>
	Mon, 30 Mar 2026 14:20:00 +0000 (16:20 +0200)
committer	Georgi Gerganov <redacted>
	Wed, 1 Apr 2026 13:00:26 +0000 (16:00 +0300)
src/ggml-cuda/argsort.cu		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history