* CUDA: fix should_use_mmvf for ne11 == 1
* Apply suggestion from @am17an
Co-authored-by: Aman Gupta <redacted>
---------
Co-authored-by: Aman Gupta <redacted>
if (src0_ne[0] % (warp_size * (4/ts)) != 0) {
return false;
}
- for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+
+ if (src0_nb[0] != ts) {
+ return false;
+ }
+
+ // Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash:
+ for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
if (src0_nb[i] % (2*ts) != 0) {
return false;
}
if (src0_ne[0] % 2 != 0) {
return false;
}
+
const size_t ts = ggml_type_size(type);
- for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ if (src0_nb[0] != ts) {
+ return false;
+ }
+
+ // Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash:
+ for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
if (src0_nb[i] % (2*ts) != 0) {
return false;
}
}
+
switch (type) {
case GGML_TYPE_F32:
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {