GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+ // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+ // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
+ GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
+
// true for a tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
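To make the difference between the two predicates concrete, here is a minimal usage sketch (my own example, not part of the change; the shapes are arbitrary): a permuted tensor keeps a dense, gap-free allocation but loses the natural iteration order, while a sliced view leaves gaps between rows and fails both checks.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // dense 4x3
    struct ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3);             // same bytes, reordered strides
    struct ggml_tensor * v = ggml_view_2d(ctx, a, 2, 3, a->nb[1], 0);      // 2 of 4 elements per row -> gaps

    printf("a: %d %d\n", ggml_is_contiguous(a), ggml_is_contiguously_allocated(a)); // 1 1
    printf("p: %d %d\n", ggml_is_contiguous(p), ggml_is_contiguously_allocated(p)); // 0 1
    printf("v: %d %d\n", ggml_is_contiguous(v), ggml_is_contiguously_allocated(v)); // 0 0

    ggml_free(ctx);
    return 0;
}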
size_t nb23 = V->nb[3];
if (need_f16_K && K->type != GGML_TYPE_F16) {
+ GGML_ASSERT(ggml_is_contiguously_allocated(K));
K_f16.alloc(ggml_nelements(K));
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
}
if (need_f16_V && V->type != GGML_TYPE_F16) {
+ GGML_ASSERT(ggml_is_contiguously_allocated(V));
V_f16.alloc(ggml_nelements(V));
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+ GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+ GGML_ASSERT(!src0->view_src);
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
}
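For reference, a small sketch of the padding arithmetic above with made-up numbers (assuming MATRIX_ROW_PADDING == 512 as in the CUDA backend and Q4_0 packing 32 elements into an 18-byte block): the tail that gets memset is the byte size of the elements needed to round the last row up to the next padding boundary.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    const int64_t ne00 = 4160;             // hypothetical row length, not a multiple of 512
    const int64_t pad  = 512 - ne00 % 512; // 448 elements to reach the next 512 boundary
    // ggml_row_size(Q4_0, 448) = 448/32 blocks * 18 bytes = 252 bytes cleared after the data
    printf("nbytes_padding = %zu\n", ggml_row_size(GGML_TYPE_Q4_0, pad));
    return 0;
}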
ggml_tensor src0_slice = *src0;
- src0_slice.ne[2] = 1;
- src0_slice.nb[3] = src0_slice.nb[2];
- src0_slice.data = (char *) src0->data + i02*nb02;
- GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);
+ src0_slice.ne[2] = 1;
+ src0_slice.nb[3] = src0_slice.nb[2];
+ src0_slice.op = GGML_OP_VIEW;
+ src0_slice.view_src = dst->src[0]; // non-const pointer to src0
+ src0_slice.data = (char *) src0->data + i02*nb02;
ggml_tensor src1_slice;
memset(&src1_slice, 0, sizeof(src1_slice));
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
- GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+ GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
- GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+ GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
return ggml_is_contiguous_n(tensor, 2);
}
+bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
+ return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+}
+
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
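As a quick sanity check of the new one-liner (hand-computed numbers, not from the patch, valid for types with a block size of 1 such as F32): ggml_nbytes spans from the start of the first element to the end of the last one, so any gapless layout, permuted or not, matches the dense size exactly, while a strided view overshoots it.

// transposed 4x3 F32 tensor:      ne = {3, 4}, nb = {16, 4} bytes
//   ggml_nbytes = 4 + (3-1)*16 + (4-1)*4 = 48,  dense = 12*4 = 48  -> contiguously allocated
// 2x3 F32 view of the same rows:  ne = {2, 3}, nb = {4, 16} bytes
//   ggml_nbytes = 4 + (2-1)*4 + (3-1)*16 = 40,  dense =  6*4 = 24  -> not contiguously allocated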