CUDA: add a prop in ggml_cuda_device_info to distinguish iGPU or dGPU in cuda ...

author Shawn yang <redacted>

Sat, 31 May 2025 06:48:04 +0000 (14:48 +0800)

committer Georgi Gerganov <redacted>

Sun, 1 Jun 2025 12:14:44 +0000 (15:14 +0300)
author Shawn yang <redacted>
Sat, 31 May 2025 06:48:04 +0000 (14:48 +0800)
committer Georgi Gerganov <redacted>
Sun, 1 Jun 2025 12:14:44 +0000 (15:14 +0300)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh

index df450b187889baaab973b71e04178336ad32cebd..e1ce1d4cd1558f75c66ef04fab44ce21a8206cfc 100644 (file)
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
          int     nsm;                // number of streaming multiprocessors
          size_t  smpb;               // max. shared memory per block
          size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    integrated;         // Device is integrated as opposed to discrete
          bool    vmm;                // virtual memory support
          size_t  vmm_granularity;    // granularity of virtual memory
          size_t  total_vram;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index 009ed9048dad3c911829f2763a5c31c96fb1a396..2a6f7f108b3f86128246261a3872db9a700e3666 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
  
          info.default_tensor_split[id] = total_vram;
          total_vram += prop.totalGlobalMem;
-
-        info.devices[id].nsm       = prop.multiProcessorCount;
-        info.devices[id].smpb      = prop.sharedMemPerBlock;
-        info.devices[id].warp_size = prop.warpSize;
+        info.devices[id].integrated = prop.integrated;
+        info.devices[id].nsm        = prop.multiProcessorCount;
+        info.devices[id].smpb       = prop.sharedMemPerBlock;
+        info.devices[id].warp_size  = prop.warpSize;
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
          info.devices[id].smpbo = prop.sharedMemPerBlock;
  
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
      GGML_UNUSED(buft);
  }
  
+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+}
+
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      CUDA_CHECK(cudaFreeHost(buffer->context));
  }
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
  
  static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
      bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    // flag used to determine whether it is an integrated_gpu
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
  
      while (!graph_evaluated_or_captured) {
          // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                      if (node->src[j] != nullptr) {
                          assert(node->src[j]->buffer);
                          assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
-                               ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                               ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
                      }
                  }
  #endif
@@ -3266,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
  }
  
  static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
+    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
  }
  
  static int64_t get_op_batch_size(const ggml_tensor * op) {
author	Shawn yang <redacted>
	Sat, 31 May 2025 06:48:04 +0000 (14:48 +0800)
committer	Georgi Gerganov <redacted>
	Sun, 1 Jun 2025 12:14:44 +0000 (15:14 +0300)
ggml/src/ggml-cuda/common.cuh		patch \| blob \| history
ggml/src/ggml-cuda/ggml-cuda.cu		patch \| blob \| history