CUDA: no -sm row for very small matrices (#10185)

author Johannes Gäßler <redacted>

Thu, 14 Nov 2024 12:00:15 +0000 (13:00 +0100)

committer GitHub <redacted>

Thu, 14 Nov 2024 12:00:15 +0000 (13:00 +0100)
author Johannes Gäßler <redacted>
Thu, 14 Nov 2024 12:00:15 +0000 (13:00 +0100)
committer GitHub <redacted>
Thu, 14 Nov 2024 12:00:15 +0000 (13:00 +0100)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu

index 357cee660cd38f2ea80840ed21c9cb63056cc9e8..b5096b3ee6bace02da13b2ae3f023624bb1976c8 100644 (file)
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2978,6 +2978,17 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
              {
                  struct ggml_tensor * a = op->src[0];
                  struct ggml_tensor * b = op->src[1];
+                // for small weight matrices the active device can end up without any rows; don't use row split in those cases
+                // this avoids some edge cases (and the performance would not be good anyway)
+                if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+                    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
+                    int64_t row_low;
+                    int64_t row_high;
+                    get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
+                    if (row_low == row_high) {
+                        return false;
+                    }
+                }
                  if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                      return false;
                  }
author	Johannes Gäßler <redacted>
	Thu, 14 Nov 2024 12:00:15 +0000 (13:00 +0100)
committer	GitHub <redacted>
	Thu, 14 Nov 2024 12:00:15 +0000 (13:00 +0100)