* llama-model-loader: use pinned memory for tensor overrides
* change to a one-time warning (emitted when tensor overrides to CPU are combined with mmap) instead
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
// when overriding to a CPU buffer, consider the extra buffer types
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
+ if (use_mmap) {
+ static std::once_flag once;
+ std::call_once(once, [] {
+ LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
+ });
+ }
} else {
buft = overrides->buft;
}