From: Johannes Gäßler
Date: Sat, 29 Jul 2023 21:04:10 +0000 (+0200)
Subject: CUDA: faster multi GPU synchronization (#2448)
X-Git-Tag: gguf-v0.4.0~379
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=9baf9ef304f330009d5a93b7390280a0fd27c9a1;p=pkg%2Fggml%2Fsources%2Fllama.cpp

CUDA: faster multi GPU synchronization (#2448)
---

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index d31fc79c..511f48c0 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3529,13 +3529,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         if (split) {
             // src0 = weight matrix is saved as a transposed matrix for better memory layout.
             // dst is NOT transposed.
-            // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+            // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
             // Instead they need to be copied to the correct slice in ne0 = dst row index.
             // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-            for (int64_t j = 0; j < ne1; ++j) {
-                float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
-                CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
-            }
+            float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                         i01_diff*sizeof(float), ne1, kind, cudaStream_main));
         } else {
             float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
             CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
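
The change collapses a loop of ne1 row-sized cudaMemcpyAsync calls into a single strided cudaMemcpy2DAsync, so each GPU's output slice is gathered with one transfer instead of one per row. Below is a minimal, self-contained sketch of the same pattern; it is not part of the commit, the buffer names, sizes, and host-side setup are illustrative, and only the two copy patterns mirror the diff.

// sketch.cu -- illustrative only; names (src, dst, stream) are hypothetical.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);       \
            exit(1);                                                     \
        }                                                                \
    } while (0)

int main(void) {
    const int64_t ne0      = 1024; // full row width of dst
    const int64_t ne1      = 512;  // number of rows
    const int64_t i01_low  = 256;  // first column of this GPU's slice
    const int64_t i01_diff = 128;  // width of this GPU's slice

    float * src; // device: ne1 x i01_diff slice, densely packed
    float * dst; // host:   ne1 x ne0 full matrix, pinned for async copies
    CUDA_CHECK(cudaMalloc((void **) &src, ne1*i01_diff*sizeof(float)));
    CUDA_CHECK(cudaMallocHost((void **) &dst, ne1*ne0*sizeof(float)));
    CUDA_CHECK(cudaMemset(src, 0, ne1*i01_diff*sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Old pattern: one cudaMemcpyAsync per row -> ne1 runtime calls.
    for (int64_t j = 0; j < ne1; ++j) {
        CUDA_CHECK(cudaMemcpyAsync(dst + j*ne0 + i01_low, src + j*i01_diff,
                                   i01_diff*sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
    }

    // New pattern: one strided 2D copy. dpitch is the destination row
    // stride (ne0 floats), spitch the packed source stride; width and
    // height describe the slice being copied.
    CUDA_CHECK(cudaMemcpy2DAsync(dst + i01_low, ne0*sizeof(float),
                                 src, i01_diff*sizeof(float),
                                 i01_diff*sizeof(float), ne1,
                                 cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(src));
    CUDA_CHECK(cudaFreeHost(dst));
    return 0;
}

The pinned (cudaMallocHost) destination in the sketch is an assumption made so the async copies can actually overlap with the stream. The key point matching the diff is that cudaMemcpy2DAsync expresses the destination row stride (dpitch = ne0 floats) and the packed source stride (spitch = slice width) in a single call, cutting per-call launch overhead when ne1 is large, which is presumably what makes the multi-GPU result gather faster.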