git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
CUDA: Faster Mixtral prompt processing (#4538)
author Johannes Gäßler <redacted>
Wed, 20 Dec 2023 14:41:22 +0000 (15:41 +0100)
committer GitHub <redacted>
Wed, 20 Dec 2023 14:41:22 +0000 (15:41 +0100)
* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <redacted>
---------

Co-authored-by: slaren <redacted>
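
For context, a minimal sketch of the batched MoE path this commit introduces: the token rows routed to one expert are gathered into a contiguous scratch buffer with device-to-device copies, a single matrix multiplication is run over the whole gathered batch, and the result rows are scattered back to their original positions. This is illustrative only; the names here (expert_gemm_contiguous, X_contig, Y_contig, row_ids) are hypothetical, and cuBLAS stands in for ggml_cuda_mul_mat, which the actual code calls on reshaped tensor views.

// moe_contiguous_sketch.cu -- illustrative sketch, not the code from this commit.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// For one expert: gather the token rows routed to it into a contiguous
// scratch buffer, run a single GEMM over the gathered batch, then scatter
// the result rows back to their original positions in Y.
static void expert_gemm_contiguous(
        cublasHandle_t handle, cudaStream_t stream,
        const float * X,                   // device, n_tokens x k (row-major) activations
        const float * W,                   // device, m x k (row-major) expert weights
        float       * Y,                   // device, n_tokens x m (row-major) output
        const std::vector<int> & row_ids,  // host: token rows routed to this expert
        int k, int m,
        float * X_contig,                  // device scratch, >= row_ids.size() x k floats
        float * Y_contig) {                // device scratch, >= row_ids.size() x m floats
    const int n_rows = (int) row_ids.size();
    if (n_rows == 0) {
        return; // no tokens routed to this expert
    }

    // Gather: one small device-to-device copy per selected row.
    for (int i = 0; i < n_rows; ++i) {
        CUDA_CHECK(cudaMemcpyAsync(X_contig + (size_t) i * k,
                                   X        + (size_t) row_ids[i] * k,
                                   k * sizeof(float), cudaMemcpyDeviceToDevice, stream));
    }

    // One GEMM over all gathered rows: Y_contig = X_contig * W^T.
    // cuBLAS is column-major, so this is expressed as Y^T = W * X^T.
    const float alpha = 1.0f, beta = 0.0f;
    cublasSetStream(handle, stream);
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n_rows, k,
                &alpha, W, k, X_contig, k, &beta, Y_contig, m);

    // Scatter: copy each result row back to the token position it came from.
    for (int i = 0; i < n_rows; ++i) {
        CUDA_CHECK(cudaMemcpyAsync(Y        + (size_t) row_ids[i] * m,
                                   Y_contig + (size_t) i * m,
                                   m * sizeof(float), cudaMemcpyDeviceToDevice, stream));
    }
}

The payoff is the same as in the diff below: one large GEMM per expert instead of one matrix-vector product per routed token, at the cost of the extra gather/scatter copies.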
ggml-cuda.cu

index f20846fef8de60b4dfca50f0c66d32d3296ad8bd..9f4b188cbb0d6f24b8c783fe2b134889f336a960 100644
@@ -7830,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
@@ -7881,8 +7886,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
@@ -8781,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
     const int32_t n_as = ((int32_t *) dst->op_params)[1];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -8804,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.ne[1] = 1;
-    dst_row.ne[1] = 1;
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
 
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
+    char * src1_original = (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  = (char *)  dst_extra->data_device[g_main_device];
 
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
+    if (src1->ne[1] == 1) {
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
 
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        //int32_t row_id;
-        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
 
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
 
-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
 
-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous,  as_dst);
     }
 }
 
@@ -9370,6 +9434,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             return false;
     }
 
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }
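
As a side note on the peer-access hunks above, a minimal sketch of the synchronize-then-toggle pattern using plain CUDA runtime calls. The real ggml_cuda_set_peer_access also gates on a token-count threshold and build flags, which this sketch omits; set_peer_access_all and device_count are hypothetical names.

// peer_access_sketch.cu -- illustrative sketch, not the code from this commit.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// Enable or disable peer access between all device pairs. Every device is
// synchronized first so no kernel is still running while the peer topology
// changes -- the same idea as the synchronization loop added in this commit.
static void set_peer_access_all(int device_count, bool enable) {
    for (int id = 0; id < device_count; ++id) {
        CUDA_CHECK(cudaSetDevice(id));
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    for (int id = 0; id < device_count; ++id) {
        CUDA_CHECK(cudaSetDevice(id));
        for (int other = 0; other < device_count; ++other) {
            if (other == id) {
                continue;
            }
            int can_access = 0;
            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, id, other));
            if (!can_access) {
                continue;
            }
            if (enable) {
                const cudaError_t err = cudaDeviceEnablePeerAccess(other, 0);
                if (err != cudaErrorPeerAccessAlreadyEnabled) {
                    CUDA_CHECK(err);
                }
            } else {
                const cudaError_t err = cudaDeviceDisablePeerAccess(other);
                if (err != cudaErrorPeerAccessNotEnabled) {
                    CUDA_CHECK(err);
                }
            }
        }
    }
}

Moving the ggml_cuda_set_peer_access call into ggml_cuda_compute_forward (last hunk) means the peer topology is only touched when src0 is actually split across GPUs, instead of on every ggml_cuda_op_mul_mat call.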