GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
    }
    return ggml_nbytes(tensor);
}
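For reference, the new assert only formalizes the existing contract: a get_alloc_size override may request extra space (for example, row padding) but never less than ggml_nbytes(tensor). Below is a minimal sketch of an override that satisfies this contract, modeled on the CUDA padding logic further down in this diff; example_buft_get_alloc_size and EXAMPLE_ROW_PADDING are illustrative names, while ggml_nbytes, ggml_is_quantized and ggml_row_size are existing ggml API.

#include "ggml.h"
#include "ggml-backend.h"

#define EXAMPLE_ROW_PADDING 512 // illustrative; the CUDA backend uses MATRIX_ROW_PADDING for this

// hypothetical buffer-type override: pad quantized tensors so that kernels can
// safely read a full quantization block past the end of the tensor data
static size_t example_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
    (void) buft; // unused in this sketch

    size_t size = ggml_nbytes(tensor);

    if (ggml_is_quantized(tensor->type)) {
        const int64_t ne0 = tensor->ne[0];
        if (ne0 % EXAMPLE_ROW_PADDING != 0) {
            // rows are stored contiguously, so only the last row needs the extra bytes
            size += ggml_row_size(tensor->type, EXAMPLE_ROW_PADDING - ne0 % EXAMPLE_ROW_PADDING);
        }
    }

    // always >= ggml_nbytes(tensor), so the assert above holds
    return size;
}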
    // TODO: mmq/mmv support
#endif
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1 = dst->nb[1];
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1 = dst->nb[1];
    const struct ggml_tensor * ids = src0;
    const int32_t id = ((int32_t *) dst->op_params)[0];
    if (ggml_is_quantized(tensor->type)) {
        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
        }
    }
}
}
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];
    if (ggml_is_quantized(tensor->type)) {