CUDA: properly handle nb00=nb02 case for cpy (llama/17081)

author bssrdf <redacted>

Fri, 7 Nov 2025 22:41:58 +0000 (17:41 -0500)

committer Georgi Gerganov <redacted>

Sun, 9 Nov 2025 16:30:22 +0000 (18:30 +0200)
author bssrdf <redacted>
Fri, 7 Nov 2025 22:41:58 +0000 (17:41 -0500)
committer Georgi Gerganov <redacted>
Sun, 9 Nov 2025 16:30:22 +0000 (18:30 +0200)
diff --git a/src/ggml-cuda/cpy.cu b/src/ggml-cuda/cpy.cu

index 1dba60eb143ef13ef4b9db8a9c8a109781cec1e7..50612237c8a23b9dee2d71203832da384a54e065 100644 (file)
--- a/src/ggml-cuda/cpy.cu
+++ b/src/ggml-cuda/cpy.cu
@@ -198,7 +198,7 @@ static void ggml_cpy_flt_cuda(
      if (transposed) {
          GGML_ASSERT(ne == ne00*ne01*ne02);  // ne[3] is 1 assumed
          int ne00n, ne01n, ne02n;
-        if (nb00 < nb02) {
+        if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
              ne00n = ne00;
              ne01n = ne01;
              ne02n = ne02;
@@ -206,8 +206,6 @@ static void ggml_cpy_flt_cuda(
              ne00n = ne00;
              ne01n = ne01*ne02;
              ne02n = 1;
-        } else {
-            GGML_ASSERT(false);
          }
  
          dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index bffd60f386ae2bedf28959fe7270eaff17a020f4..b9ae82eeddd71e1443025a5e59045e90bf5e889d 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6648,6 +6648,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
      test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
      test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
      test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
  
      test_cases.emplace_back(new test_cont());
      test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
author	bssrdf <redacted>
	Fri, 7 Nov 2025 22:41:58 +0000 (17:41 -0500)
committer	Georgi Gerganov <redacted>
	Sun, 9 Nov 2025 16:30:22 +0000 (18:30 +0200)
src/ggml-cuda/cpy.cu		patch | blob | history
tests/test-backend-ops.cpp		patch | blob | history