// device description: short informative description of the device, could be the model name
const char * (*get_description)(ggml_backend_dev_t dev);
- // device memory in bytes
+ // device memory in bytes: a device may report 0 bytes for both free and total to indicate it has no memory to report
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
// device type
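Backends that cannot (or choose not to) report memory can satisfy the new contract simply by writing zeros. A minimal sketch of such a get_memory implementation, not part of the patch (the function name is illustrative; GGML_UNUSED is the standard ggml helper macro):

// illustrative sketch: a device with no queryable memory honors the
// new contract by reporting 0 bytes for both free and total
static void example_dev_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);
    *free  = 0; // nothing to report
    *total = 0; // callers are expected to fall back to host memory
}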
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
}
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+     throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
// calculate the split points
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
std::vector<float> splits(n_devices());
size_t total;
size_t free;
ggml_backend_dev_memory(dev, &free, &total);
+
+ // devices can report 0 bytes for both free and total memory if they have
+ // none to report; in this case, use the host memory as a fallback
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+ if (free == 0 && total == 0) {
+     ggml_backend_dev_memory(cpu_dev, &free, &total);
+ }
splits[i] = free;
}
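The same query-then-fall-back pattern reappears in the second hunk below. As a sketch, it could be factored into a small helper (dev_memory_or_host is a hypothetical name, not part of the patch; the ggml calls are the public API already used above, and the surrounding llama.cpp includes are assumed):

// hypothetical helper: query a device's memory and substitute host
// memory when the device reports 0 bytes for both values
static void dev_memory_or_host(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_dev_memory(dev, free, total);
    if (*free == 0 && *total == 0) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error("no CPU backend found");
        }
        ggml_backend_dev_memory(cpu_dev, free, total);
    }
}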
} else {
splits[i] /= split_sum;
}
- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (cpu_dev == nullptr) {
-     throw std::runtime_error(format("%s: no CPU backend found", __func__));
- }
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
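As illustrative arithmetic (numbers not from the patch): with hparams.n_layer = 32 and n_gpu_layers = 20, i_gpu_start = max(32 + 1 - 20, 0) = 13 and act_gpu_layers = min(20, 33) = 20, so the 20 highest of the 33 layer slots (indices 13..32, counting the output layer as one slot, hence the + 1) are assigned GPU buffer types.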
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
}
}
for (size_t i = 0; i < ret.size(); i++) {
- size_t free, total;
+ size_t free;
+ size_t total;
ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+ // devices can report 0 bytes for both free and total memory if they have
+ // none to report; in this case, use the host memory as a fallback
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+ if (free == 0 && total == 0) {
+     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+     if (cpu_dev == nullptr) {
+         throw std::runtime_error(format("%s: no CPU backend found", __func__));
+     }
+     ggml_backend_dev_memory(cpu_dev, &free, &total);
+ }
ret[i].free = free;
ret[i].total = total;
}
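A caller consuming these entries might then do something like the following sketch (the ret container and its free/total fields are taken from the hunk above; the logging itself is illustrative):

// illustrative only: after the host-memory fallback, entries for devices
// that reported 0/0 now carry the CPU backend's memory figures instead
for (size_t i = 0; i < ret.size(); i++) {
    printf("device %zu: %zu MiB free / %zu MiB total\n",
           i, ret[i].free / (1024*1024), ret[i].total / (1024*1024));
}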