cuda : add LLAMA_CUDA_NO_PEER_COPY to workaround broken ROCm p2p copy (llama/6208)

author slaren <redacted>

Fri, 22 Mar 2024 13:05:31 +0000 (14:05 +0100)

committer Georgi Gerganov <redacted>

Wed, 27 Mar 2024 11:20:00 +0000 (13:20 +0200)
author slaren <redacted>
Fri, 22 Mar 2024 13:05:31 +0000 (14:05 +0100)
committer Georgi Gerganov <redacted>
Wed, 27 Mar 2024 11:20:00 +0000 (13:20 +0200)
diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu

index 14f409eb1dd6aae6f5581101c59b081ecf8dffb9..adf930478f5ff3286b1349fe206c62610f72d88a 100644 (file)
--- a/src/ggml-cuda.cu
+++ b/src/ggml-cuda.cu
@@ -771,7 +771,11 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
          if (src_ctx->device == dst_ctx->device) {
              CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
          } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+            return false;
+#else
              CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+#endif
          }
          CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
          return true;
@@ -11322,19 +11326,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
          GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
          GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
  
-        if (!cuda_ctx_src->copy_event) {
-            ggml_cuda_set_device(cuda_ctx_src->device);
-            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
-        }
-
          // copy on src stream
          if (cuda_ctx_src->device == cuda_ctx_dst->device) {
              CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
          } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+            return false;
+#else
              CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
+#endif
          }
  
          // record event on src stream
+        if (!cuda_ctx_src->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_src->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+        }
+
          CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
  
          // wait on dst stream for the copy to complete
@@ -11530,6 +11538,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
  }
  
  static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+#ifdef GGML_CUDA_NO_PEER_COPY
+    return nullptr;
+#else
      ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
  
      ggml_cuda_set_device(cuda_ctx->device);
@@ -11541,6 +11552,7 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend)
          /* .backend = */ backend,
          /* .context = */ event,
      };
+#endif
  }
  
  static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
author	slaren <redacted>
	Fri, 22 Mar 2024 13:05:31 +0000 (14:05 +0100)
committer	Georgi Gerganov <redacted>
	Wed, 27 Mar 2024 11:20:00 +0000 (13:20 +0200)