CUDA: fix crash with partial offloading of MoE (#13439)

author Johannes Gäßler <redacted>

Sun, 11 May 2025 14:09:33 +0000 (16:09 +0200)

committer GitHub <redacted>

Sun, 11 May 2025 14:09:33 +0000 (16:09 +0200)
author Johannes Gäßler <redacted>
Sun, 11 May 2025 14:09:33 +0000 (16:09 +0200)
committer GitHub <redacted>
Sun, 11 May 2025 14:09:33 +0000 (16:09 +0200)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index 7643c4b8bfa2b148bb331c3eb7dcf4bb0b1be42d..b4b85abcda9e38cc06921061c38a985912ef9e89 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
  static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
  
+    // If src0 is a temporary compute buffer, its allocation may include padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is padded and is also a view of another tensor, the padding cannot be cleared safely because doing so may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
      bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
          && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
          && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
          && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
          && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
          && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
  
      bool any_gpus_with_slow_fp16   = false;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu

index b4962e6a51c8b9dd0a7b97f153347fb0a8d3f294..e1cf843de1a652d96977bb15913ef3dcca8853bb 100644 (file)
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
  
      // If src0 is a temporary compute buffer, clear any potential padding.
      if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
          const size_t size_data  = ggml_nbytes(src0);
          const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
          if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
              CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
          }
      }
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu

index 3b313ea2953c1a93ab385f1514a9e185279aa941..dc7adf509fac086ef4d4a2dad33fec8261afea02 100644 (file)
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
  
      // If src0 is a temporary compute buffer, clear any potential padding.
      if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
          const size_t size_data  = ggml_nbytes(src0);
          const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
          if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
              CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
          }
      }
author	Johannes Gäßler <redacted>
	Sun, 11 May 2025 14:09:33 +0000 (16:09 +0200)
committer	GitHub <redacted>
	Sun, 11 May 2025 14:09:33 +0000 (16:09 +0200)
ggml/src/ggml-cuda/ggml-cuda.cu		patch \| blob \| history
ggml/src/ggml-cuda/mmq.cu		patch \| blob \| history
ggml/src/ggml-cuda/mmvq.cu		patch \| blob \| history