i01_high = row_high % ne01;
}
}
+
+ // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or the optimizing-out of local variables.
+ // Removing the first assert or changing the order of the arguments causes the second assert to fail.
+ // Removing both asserts results in i01_high becoming 0, which in turn results in garbage output.
+ // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (on a single GPU).
+ GGML_ASSERT(i01_low == 0 || g_device_count > 1);
+ GGML_ASSERT(i01_high == ne01 || g_device_count > 1);
+
const int64_t i01_diff = i01_high - i01_low;
if (i01_diff == 0) {
continue;
row_low -= row_low % GGML_CUDA_DMMV_Y;
row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
row_high -= row_high % GGML_CUDA_DMMV_Y;
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
} else {
GGML_ASSERT(false);
}