GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
+ // integrated GPU device using host memory
+ GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
// all the device properties
struct ggml_backend_dev_props {
+ // device name
const char * name;
+ // device description
const char * description;
+ // device free memory in bytes
size_t memory_free;
+ // device total memory in bytes
size_t memory_total;
+ // device type
enum ggml_backend_dev_type type;
+ // device id
+ // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+ // if the id is unknown, this should be NULL
+ const char * device_id;
+ // device capabilities
struct ggml_backend_dev_caps caps;
};
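// Illustrative sketch, not part of the patch: enumerating devices and reading the
// new fields through the public registry API (ggml_backend_dev_count /
// ggml_backend_dev_get / ggml_backend_dev_get_props). Assumes ggml-backend.h as above.
#include "ggml-backend.h"
#include <stdio.h>

static void print_devices(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("%s (%s), id: %s, %zu/%zu MiB free\n",
            props.name, props.description,
            props.device_id ? props.device_id : "unknown",
            props.memory_free / 1024 / 1024,
            props.memory_total / 1024 / 1024);
    }
}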
struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
+ std::string pci_bus_id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->name.c_str();
}
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
+ props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ char pci_bus_id[16] = {};
+ snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
+ dev_ctx->pci_bus_id = pci_bus_id;
+
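+ // illustrative: pciDomainID = 0, pciBusID = 1, pciDeviceID = 0 formats to
+ // "0000:01:00.0"; the function component is hard-coded to 0 by the format string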
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
bool llama_supports_gpu_offload(void) {
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+ ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
llama_supports_rpc();
}
model->devices.push_back(*dev);
}
} else {
+ // default device selection
+
+ // build list of available devices
+ std::vector<ggml_backend_dev_t> gpus;
+ std::vector<ggml_backend_dev_t> igpus;
std::vector<ggml_backend_dev_t> rpc_servers;
- // use all available devices
+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
// skip CPU backends since they are handled separately
break;
- case GGML_BACKEND_DEVICE_TYPE_GPU:
+ case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev);
} else {
- model->devices.push_back(dev);
+ // check if there is already a GPU with the same device id
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+ ggml_backend_dev_props d_props;
+ ggml_backend_dev_get_props(d, &d_props);
+ if (props.device_id && d_props.device_id) {
+ return strcmp(props.device_id, d_props.device_id) == 0;
+ }
+ return false;
+ });
+
+ if (it != gpus.end()) {
+ LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+ __func__,
+ ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+ props.device_id ? props.device_id : "unknown id",
+ ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+ } else {
+ gpus.push_back(dev);
+ }
}
break;
+ }
+
+ case GGML_BACKEND_DEVICE_TYPE_IGPU:
+ igpus.push_back(dev);
+ break;
}
}
- // add RPC servers at the front of the list
- if (!rpc_servers.empty()) {
- model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+ // add RPC servers at the front of the list to minimize network transfers
+ model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+ // add GPUs
+ model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+ // add integrated GPUs only if no other devices were found
+ if (model->devices.empty()) {
+ model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
}
}
}
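// Hypothetical standalone helper (not in the patch) restating the dedup rule used
// above: two enumerated devices are treated as the same physical GPU iff both
// report a non-NULL device_id and the strings compare equal; devices with an
// unknown id never match, so they are never skipped.
#include <cstring>

static bool same_physical_device(const ggml_backend_dev_props & a, const ggml_backend_dev_props & b) {
    return a.device_id && b.device_id && strcmp(a.device_id, b.device_id) == 0;
}

// Net selection order after this block: RPC servers first (to minimize network
// transfers), then de-duplicated dedicated GPUs, then integrated GPUs only as a
// fallback when the list would otherwise be empty. For example, a single dGPU
// visible to two backends via the same PCI bus id is added only once.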
for (auto * dev : model->devices) {
- size_t free, total; // NOLINT
- ggml_backend_dev_memory(dev, &free, &total);
- LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+ ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+ props.device_id ? props.device_id : "unknown id",
+ props.memory_free/1024/1024);
}
const int status = llama_model_load(path_model, splits, *model, params);
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
auto * dev = ggml_backend_dev_get(i);
auto dev_type = ggml_backend_dev_type(dev);
- if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
gpu_list.push_back(ggml_backend_dev_description(dev));
}
}
exit(1);
}
}
+ // FIXME: use llama.cpp device selection logic
// add local GPU devices if any
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_GPU:
devices.push_back(dev);
break;
+
+ case GGML_BACKEND_DEVICE_TYPE_IGPU:
+ // iGPUs are not used when there are RPC servers
+ break;
}
}
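// the trailing NULL presumably terminates the list for a NULL-terminated
// consumer (e.g. the NULL-terminated devices array in llama_model_params)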
devices.push_back(nullptr);