static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
+ // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+ // But if src0 is also a view of another tensor then the padding cannot be cleared safely because doing so may overwrite valid data of the viewed tensor.
+ // Therefore, fall back to cuBLAS in such cases.
+ const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+ && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
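// Select which specialized kernels are eligible for this multiplication. The quantized paths
// (mul_mat_vec_q, mul_mat_q) are disabled when the padding cannot be cleared safely, so the op
// falls back to cuBLAS as described above.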
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
- bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+ bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
- bool use_mul_mat_q = ggml_is_quantized(src0->type)
+ bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
- GGML_ASSERT(ggml_is_contiguously_allocated(src0));
- GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
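// Only assert when there is actually padding to clear: in that case src0 must be contiguously
// allocated and must not be a view, since the quantized kernels may read the padded region,
// which is zeroed here.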
+ GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+ GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, ctx.stream()));
}
}