GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
- GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
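+ // pre-pass: sum the total VRAM of all devices so it can be shown in the header line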
+ for (int id = 0; id < info.device_count; ++id) {
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
+ total_vram += prop.totalGlobalMem;
+ }
+ GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices (Total VRAM: %zu MiB):\n",
+ __func__, info.device_count, (size_t)(total_vram / (1024 * 1024)));
+ total_vram = 0; // reset; the per-device loop below accumulates it again
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
for (int id = 0; id < info.device_count; ++id) {
#else
info.devices[id].supports_cooperative_launch = false;
#endif // !(GGML_USE_MUSA)
+
+ // cudaMemGetInfo reports free and total memory for the current device,
+ // so select the device first; pass valid pointers for both out-parameters,
+ // since NULL arguments are not documented as supported
+ size_t free_mem;
+ size_t total_mem;
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+
#if defined(GGML_USE_HIP)
info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc += prop.minor * 0x10;
}
}
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
- device_vmm ? "yes" : "no", prop.warpSize);
+ device_vmm ? "yes" : "no", prop.warpSize,
+ (size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ (size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ (size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });