cuda : prevent using split buffers with 3d/4d matrices (llama/13919)

author Diego Devesa <redacted>

Fri, 30 May 2025 14:37:18 +0000 (07:37 -0700)

committer Georgi Gerganov <redacted>

Sun, 1 Jun 2025 11:01:05 +0000 (14:01 +0300)
author Diego Devesa <redacted>
Fri, 30 May 2025 14:37:18 +0000 (07:37 -0700)
committer Georgi Gerganov <redacted>
Sun, 1 Jun 2025 11:01:05 +0000 (14:01 +0300)
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu

index c442a64924303146040aa58016d59482d16b2363..009ed9048dad3c911829f2763a5c31c96fb1a396 100644 (file)
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -2994,9 +2994,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
              {
                  struct ggml_tensor * a = op->src[0];
                  struct ggml_tensor * b = op->src[1];
-                // for small weight matrices the active device can end up without any rows, don't use row split in those cases
-                // this avoids some edge cases (and the performance would not be good anyways)
                  if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+                    if (a->ne[2] > 1 || a->ne[3] > 1) {
+                        return false;
+                    }
+                    // for small weight matrices the active device can end up without any rows, don't use row split in those cases
+                    // this avoids some edge cases (and the performance would not be good anyways)
                      ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
                      int64_t row_low;
                      int64_t row_high;
author	Diego Devesa <redacted>
	Fri, 30 May 2025 14:37:18 +0000 (07:37 -0700)
committer	Georgi Gerganov <redacted>
	Sun, 1 Jun 2025 11:01:05 +0000 (14:01 +0300)