llama: use host memory if device reports 0 memory (#18587)
author    Aaron Teo <redacted>
Thu, 8 Jan 2026 21:34:56 +0000 (05:34 +0800)
committer GitHub <redacted>
Thu, 8 Jan 2026 21:34:56 +0000 (05:34 +0800)
ggml/src/ggml-backend-impl.h
ggml/src/ggml-opencl/ggml-opencl.cpp
src/llama-model.cpp
src/llama.cpp

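The change in one sentence: a backend device whose get_memory hook has nothing meaningful to report now returns 0 for both free and total (see the ggml-backend-impl.h and ggml-opencl.cpp hunks below), and the callers in llama-model.cpp and llama.cpp treat that 0/0 pair as a signal to fall back to host memory. A minimal caller-side sketch of the pattern, using only the ggml_backend_dev_memory and ggml_backend_dev_by_type calls that appear in the hunks; the helper name is illustrative and not part of the commit:

    #include "ggml-backend.h"

    // query a device's memory, falling back to the host (CPU) device
    // when the backend reports 0/0, i.e. "no memory to report"
    static void get_memory_with_host_fallback(ggml_backend_dev_t dev, size_t * free, size_t * total) {
        ggml_backend_dev_memory(dev, free, total);
        if (*free == 0 && *total == 0) {
            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (cpu_dev != nullptr) {
                ggml_backend_dev_memory(cpu_dev, free, total);
            }
        }
    }
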
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba986e8ed5041de450ddaab7fe2ba6af8446..59190b7c4656d8cd55d649791f3417a875a2d94b 100644
@@ -144,7 +144,7 @@ extern "C" {
         // device description: short informative description of the device, could be the model name
         const char * (*get_description)(ggml_backend_dev_t dev);
 
-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
         void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
 
         // device type
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 472e2df50a4d1f59cbfc8fa5bec402ecc9858d76..e50ca8e0f2700797cf33cf3c454a8ef1dfeb1e80 100644
@@ -4287,8 +4287,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 }
 
 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free = 1;
-    *total = 1;
+    *free = 0;
+    *total = 0;
 
     GGML_UNUSED(dev);
 }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 7ac59846bb64a5a95e751b096ac0da22dbafeb63..5de6493b9e987af2e1be7d4b50d4529aa332e27b 100644
@@ -2452,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
     }
 
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     // calculate the split points
     bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
     std::vector<float> splits(n_devices());
@@ -2462,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             size_t total;
             size_t free;
             ggml_backend_dev_memory(dev, &free, &total);
+
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
+            }
             splits[i] = free;
         }
     } else {
@@ -2478,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }
 
-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
     const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
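Why the placeholder mattered: when no tensor_split is given, load_tensors weights each device by its reported free memory (splits[i] = free above), so the OpenCL backend's old 1-byte report gave that device an effectively zero weight next to devices reporting real figures. A hedged sketch of that weighting step with the new host-memory fallback folded in; the helper name and signature are illustrative, not part of the commit:

    #include <vector>
    #include "ggml-backend.h"

    // illustrative helper: compute per-device split weights from reported
    // free memory, as load_tensors does above, with the host-memory
    // fallback for devices that report 0/0
    static std::vector<float> compute_split_weights(const std::vector<ggml_backend_dev_t> & devices) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        std::vector<float> splits(devices.size());
        for (size_t i = 0; i < devices.size(); ++i) {
            size_t free  = 0;
            size_t total = 0;
            ggml_backend_dev_memory(devices[i], &free, &total);
            if (free == 0 && total == 0 && cpu_dev != nullptr) {
                ggml_backend_dev_memory(cpu_dev, &free, &total); // host fallback
            }
            splits[i] = free; // each device is weighted by its free memory
        }
        return splits; // the caller normalizes these, as in load_tensors
    }
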
diff --git a/src/llama.cpp b/src/llama.cpp
index 33f51a238901a40e2c6d0846f6983e3b49685393..f1096d960e130ce83140157d020b7af191fd8a5c 100644
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         }
     }
     for (size_t i = 0; i < ret.size(); i++) {
-        size_t free, total;
+        size_t free;
+        size_t total;
         ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
         ret[i].free  = free;
         ret[i].total = total;
     }
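
To observe the raw values these call sites consume, the device registry can be enumerated directly; after this commit the OpenCL device reports 0/0 and llama.cpp substitutes host memory as in the hunks above. A small standalone sketch, assuming the standard ggml device-registry calls (ggml_backend_dev_count, ggml_backend_dev_get, ggml_backend_dev_name), which are not part of this diff:

    #include <cstdio>
    #include "ggml-backend.h"

    // print the memory figures each registered backend device reports;
    // a device with nothing to report now prints free=0 total=0
    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            size_t free  = 0;
            size_t total = 0;
            ggml_backend_dev_memory(dev, &free, &total);
            printf("%s: free=%zu total=%zu\n", ggml_backend_dev_name(dev), free, total);
        }
        return 0;
    }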