add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
- string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
- if (value) {
- params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
- }
}
).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
- string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_direct_io = value;
}
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
- bool use_direct_io = true; // read from disk without buffering for faster model loading
+ bool use_direct_io = false; // read from disk without buffering
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
- bool use_direct_io; // use direct io, takes precedence over use_mmap
+ bool use_direct_io; // use direct I/O; takes precedence over use_mmap when supported
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
- // Disable mmap, as DirectIO is available
- use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ use_mmap = false;
} else {
- // Disable DirectIO and reopen file using std::fopen for mmap
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
use_direct_io = false;
+
+ // reopen file using std::fopen for mmap
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
- LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
}
}
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
- /*.use_direct_io =*/ true,
+ /*.use_direct_io =*/ false,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
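+ // direct I/O is now opt-in (default: disabled), so load the model for quantization without it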
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());