{
struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->src[1];
- // for small weight matrices the active device can end up without any rows, don't use row split in those cases
- // this avoids some edge cases (and the performance would not be good anyways)
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+ if (a->ne[2] > 1 || a->ne[3] > 1) {
+ return false;
+ }
+ // for small weight matrices the active device can end up without any rows; don't use row split in those cases
+ // (this avoids some edge cases, and the performance would not be good anyway)
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
int64_t row_low;
int64_t row_high;