cuda : fix disabling device with --tensor-split 1,0 (#3951)

author Jared Van Bortel <redacted>
Sun, 5 Nov 2023 15:08:57 +0000 (10:08 -0500)
committer GitHub <redacted>
Sun, 5 Nov 2023 15:08:57 +0000 (10:08 -0500)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index dc14f2f5d76c669c10052792222c80d8aab97b80..9f873035ad0c054b5d716965485236ab88037656 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
      int64_t  row_low[GGML_CUDA_MAX_DEVICES];
      int64_t row_high[GGML_CUDA_MAX_DEVICES];
  
+    int used_devices = 0;
+
      for (int64_t id = 0; id < g_device_count; ++id) {
          // by default, use all rows
          row_low[id]  = 0;
@@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
              continue;
          }
  
+        used_devices++;
+
          const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
          const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
  
@@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
  
      // if multiple devices are used they need to wait for the main device
      // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
          CUDA_CHECK(ggml_cuda_set_device(g_main_device));
          CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
      }
  
-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
      for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
          const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
          const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
      }
  
      for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
          CUDA_CHECK(ggml_cuda_set_device(id));
  
          // free buffers again when done
@@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
  
          CUDA_CHECK(ggml_cuda_set_device(g_main_device));
          for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
              for (int64_t is = 0; is < is_max; ++is) {
                  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
              }
@@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
          (src1->backend == GGML_BACKEND_GPU) &&
          ( dst->backend == GGML_BACKEND_GPU);
author	Jared Van Bortel <redacted>
	Sun, 5 Nov 2023 15:08:57 +0000 (10:08 -0500)
committer	GitHub <redacted>
	Sun, 5 Nov 2023 15:08:57 +0000 (10:08 -0500)