return ctx->description.c_str();
}
+#if defined(__linux__)
+// Helper that reads available memory and free swap from /proc/meminfo for UMA systems
+static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_kb, long * free_swap_kb) {
+ FILE * meminfo_file = nullptr;
+ // /proc/meminfo does not report a file size via stat(), so read it into a
+ // fixed 2 KB buffer, which is large enough for the fields parsed below
+ const size_t BUFFER_SIZE = 2048;
+ auto file_buffer = std::make_unique<char[]>(BUFFER_SIZE);
+ size_t bytes_read = 0;
+ long huge_tlb_total_pages = -1;
+ long huge_tlb_free_pages = -1;
+ long huge_tlb_page_size = -1;
+
+ if (available_memory_kb == nullptr || free_swap_kb == nullptr) {
+ return false;
+ }
+
+ meminfo_file = fopen("/proc/meminfo", "r");
+ if (meminfo_file == nullptr) {
+ GGML_LOG_ERROR("%s: failed to open /proc/meminfo\n", __func__);
+ return false;
+ }
+
+ // Read the file into the buffer, leaving one byte for the null terminator
+ bytes_read = fread(file_buffer.get(), 1, BUFFER_SIZE - 1, meminfo_file);
+ fclose(meminfo_file);
+
+ if (bytes_read == 0) {
+ GGML_LOG_ERROR("%s: failed to read from /proc/meminfo\n", __func__);
+ return false;
+ }
+ file_buffer[bytes_read] = '\0';
+
+ *available_memory_kb = -1;
+ *free_swap_kb = -1;
+
+ // Parse the file buffer line by line
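+ // The fields of interest look like this in /proc/meminfo (values here are
+ // illustrative only; note the HugePages_* counts carry no kB suffix):
+ //   MemAvailable:   31415926 kB
+ //   SwapFree:        2097148 kB
+ //   HugePages_Total:       0
+ //   HugePages_Free:        0
+ //   Hugepagesize:       2048 kB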
+ char * line = file_buffer.get();
+ char * line_next;
+ while (line < file_buffer.get() + bytes_read) {
+ // Find the end of the current line
+ line_next = strchr(line, '\n');
+ if (line_next != nullptr) {
+ *line_next = '\0';
+ line_next++;
+ } else {
+ line_next = file_buffer.get() + bytes_read;
+ }
+
+ long value;
+ if (sscanf(line, "MemAvailable: %ld kB", &value) == 1) {
+ *available_memory_kb = value;
+ } else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) {
+ *free_swap_kb = value;
+ } else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) {
+ huge_tlb_total_pages = value;
+ } else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) {
+ huge_tlb_free_pages = value;
+ } else if (sscanf(line, "Hugepagesize: %ld kB", &value) == 1) {
+ huge_tlb_page_size = value;
+ }
+
+ line = line_next;
+ }
+
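+ // When a hugetlbfs pool is configured, allocations are expected to come from
+ // that pool rather than from regular pageable memory, and MemAvailable does
+ // not account for it, so report the free huge pages instead.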
+ // Require all three hugepage fields to have been parsed so the -1 sentinels
+ // never enter the computation
+ if (huge_tlb_total_pages > 0 && huge_tlb_free_pages >= 0 && huge_tlb_page_size > 0) {
+ *available_memory_kb = huge_tlb_free_pages * huge_tlb_page_size;
+
+ // Hugetlbfs pages are not swappable.
+ *free_swap_kb = 0;
+ }
+
+ GGML_LOG_DEBUG("%s: final available_memory_kb: %ld\n", __func__, *available_memory_kb);
+ return true;
+}
+#endif // defined(__linux__)
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemGetInfo(free, total));
+
+// ref: https://github.com/ggml-org/llama.cpp/pull/17368
+#if defined(__linux__)
+ // Check whether this is a UMA (Unified Memory Architecture) system.
+ // cudaDeviceProp::integrated is nonzero for devices that share physical
+ // memory with the host; unifiedAddressing would only indicate UVA support,
+ // which is set on virtually all discrete GPUs as well.
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+
+ // UMA handling can also be forced via an environment variable
+ bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
+ bool is_uma = prop.integrated > 0 || uma_env;
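+ // Note: GGML_CUDA_ENABLE_UNIFIED_MEMORY also switches this backend's device
+ // allocations to cudaMallocManaged, so forcing the UMA path this way (e.g.
+ // GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./llama-cli -m model.gguf, a hypothetical
+ // invocation) is not a reporting-only toggle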
+
+ if (is_uma) {
+ // For UMA systems (like DGX Spark), use system memory info
+ long available_memory_kb = 0;
+ long free_swap_kb = 0;
+
+ if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) {
+ // *total keeps the value reported by cudaMemGetInfo above
+ *free = (size_t)available_memory_kb * 1024;
+ } else {
+ GGML_LOG_ERROR("%s: failed to read /proc/meminfo, falling back to cudaMemGetInfo\n", __func__);
+ }
+ }
+#endif // defined(__linux__)
}
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {