CUDA: faster multi GPU synchronization (#2448)

author Johannes Gäßler <redacted>

Sat, 29 Jul 2023 21:04:10 +0000 (23:04 +0200)

committer GitHub <redacted>

Sat, 29 Jul 2023 21:04:10 +0000 (23:04 +0200)
author Johannes Gäßler <redacted>
Sat, 29 Jul 2023 21:04:10 +0000 (23:04 +0200)
committer GitHub <redacted>
Sat, 29 Jul 2023 21:04:10 +0000 (23:04 +0200)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index d31fc79c10961de17704b317b7489f7a0c5e1c9e..511f48c0a31a62cf609ce274c47dca72300236fa 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3529,13 +3529,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                      if (split) {
                          // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                          // dst is NOT transposed.
-                        // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                          // Instead they need to be copied to the correct slice in ne0 = dst row index.
                          // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                        for (int64_t j = 0; j < ne1; ++j) {
-                            float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
-                            CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
-                        }
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                     i01_diff*sizeof(float), ne1, kind, cudaStream_main));
                      } else {
                          float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                          CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
author	Johannes Gäßler <redacted>
	Sat, 29 Jul 2023 21:04:10 +0000 (23:04 +0200)
committer	GitHub <redacted>
	Sat, 29 Jul 2023 21:04:10 +0000 (23:04 +0200)