const struct llama_model_kv_override * kv_overrides;
// Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
};
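From the application side, the new flag is a one-field opt-out. A minimal sketch, assuming the usual llama.h entry points; the model path is a placeholder:

```cpp
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // skip extra buffer types, i.e. no weight repacking

    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}
```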
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
}
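For context in the hunks below: buft_list_t is the priority-ordered list of (device, buffer type) pairs that the loader walks when placing each tensor. This matches its definition in llama-model.cpp:

```cpp
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
```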
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
buft_list_t buft_list;
// add ACCEL buffer types
}
}
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }

-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
        }
    }
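The same proc-address lookup works outside the loader, e.g. to check which extra buffer types (such as repack types) a particular build exposes. A self-contained sketch, assuming the public ggml-backend.h and ggml-cpu.h headers; ggml_backend_load_all() only matters for builds with dynamically loaded backends:

```cpp
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-cpu.h" // assumed home of ggml_backend_dev_get_extra_bufts_t

int main() {
    ggml_backend_load_all(); // register dynamically loaded backends, if any

    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        fprintf(stderr, "no CPU backend found\n");
        return 1;
    }

    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

    if (get_extra_bufts != nullptr) {
        // the returned array is terminated by a null buffer type
        for (ggml_backend_buffer_type_t * p = get_extra_bufts(cpu_dev); p && *p; ++p) {
            printf("extra buffer type: %s\n", ggml_backend_buft_name(*p));
        }
    }
    return 0;
}
```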
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
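Concretely, the fallback is an append: the CPU list (including any extra types added above when use_extra_bufts is set) goes after the GPU entries, so allocation only falls through to it when no GPU buffer type can take the tensor. A sketch of the shape of that step, using the names from the surrounding code:

```cpp
// GPU buffer types keep priority; the CPU list comes last as the fallback
buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
```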
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
if (std::regex_search(tensor_name, pattern)) {
-                buft = overrides->buft;
+                if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                    // when overriding to a CPU buffer, consider the extra buffer types
+                    buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                } else {
+                    buft = overrides->buft;
+                }
+
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
tensor_name.c_str(),
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
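This branch is what tensor buffer-type overrides hit (exposed as --override-tensor / -ot on the CLI). A hedged usage sketch from the API side; the regex is purely illustrative:

```cpp
#include "ggml-backend.h"
#include "llama.h"

int main() {
    // force matching tensors onto the CPU buffer type; with this change the
    // loader may still substitute a repacked extra type for them
    const llama_model_tensor_buft_override overrides[] = {
        { "ffn_.*_exps", ggml_backend_cpu_buffer_type() },
        { nullptr,       nullptr }, // terminator: the loader stops at a null pattern
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;

    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}
```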
/*.use_mmap =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
+ /*.use_extra_bufts =*/ true,
};
#ifdef GGML_USE_METAL