add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
- string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
+ if (value) {
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+ }
}
).set_env("LLAMA_ARG_MMAP"));
+ add_opt(common_arg(
+ {"-dio", "--direct-io"},
+ {"-ndio", "--no-direct-io"},
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_direct_io = value;
+ }
+ ).set_env("LLAMA_ARG_DIO"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
+ mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool use_mmap = true; // use mmap for faster loads
+ bool use_mmap = true; // enable mmap to use filesystem cache
+ bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.use_mmap = params.use_mmap;
+ model_params.use_direct_io = params.use_direct_io;
model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
+ bool use_direct_io; // use direct I/O if available; takes precedence over use_mmap
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
}
}
- void read_raw(void * ptr, size_t len) const {
+ void read_raw(void * ptr, size_t len) {
size_t bytes_read = 0;
while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
}
}
- uint32_t read_u32() const {
+ uint32_t read_u32() {
uint32_t val;
read_raw(&val, sizeof(val));
return val;
write_raw(&val, sizeof(val));
}
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
- throw std::runtime_error("DirectIO is not implemented on Windows.");
+ bool has_direct_io() const {
+ return true;
}
~impl() {
}
}
#else
- impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
#ifdef __linux__
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
- fd = open(fname, O_RDONLY | O_DIRECT);
+ if (init_fd()) {
+ return;
+ }
+ LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+ fname, strerror(errno));
+ }
+#endif
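+ // Buffered path: used by default, and as the fallback when the O_DIRECT open fails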
+ init_fp(mode);
+ }
- if (fd != -1) {
- struct stat file_stats{};
- fstat(fd, &file_stats);
+#ifdef __linux__
+ bool init_fd() {
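+ // open with O_DIRECT so reads bypass the page cache; return false so the caller can fall back to buffered I/O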
+ fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
- size = file_stats.st_size;
- alignment = file_stats.st_blksize;
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);
- off_t ret = lseek(fd, 0, SEEK_SET);
- if (ret == -1) {
- throw std::runtime_error(format("seek error: %s", strerror(errno)));
- }
- return;
- }
+ size = file_stats.st_size;
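+ // O_DIRECT requires offset, length and buffer address to be block-aligned; use the block size reported by fstat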
+ alignment = file_stats.st_blksize;
- LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
- fname, strerror(errno));
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return true;
}
+ return false;
+ }
#endif
- fp = ggml_fopen(fname, mode);
+
+ void init_fp(const char * mode) {
+ fp = ggml_fopen(fname.c_str(), mode);
if (fp == NULL) {
- throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+ throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
}
}
- void read_raw(void * ptr, size_t len) const {
+ void read_raw_unsafe(void * ptr, size_t len) {
if (len == 0) {
return;
}
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
+ // EFAULT means O_DIRECT cannot DMA into this destination buffer; fall back to buffered I/O (std::fread) and retry
+ if (errno == EFAULT) {
+ auto curr_off = tell();
+ close(fd);
+ fd = -1;
+ alignment = 1;
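+ // fd == -1 and alignment == 1 mark the file as buffered, so later reads skip the aligned path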
+ init_fp("rb");
+ seek(curr_off, SEEK_SET);
+ read_raw_unsafe(ptr, len);
+ return;
+ }
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
}
}
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ void read_aligned_chunk(void * dest, size_t size) {
+ size_t offset = tell();
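+ // round the offset down and the read length up to the block size so the O_DIRECT read stays aligned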
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
seek(aligned_offset, SEEK_SET);
- read_raw(buffer.get(), bytes_to_read);
+ read_raw_unsafe(buffer.get(), bytes_to_read);
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
+ // restore the logical file position to the end of the requested range, since the aligned read advanced it past the padding
+ seek(offset + size, SEEK_SET);
}
- uint32_t read_u32() const {
+ void read_raw(void * ptr, size_t len) {
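+ // route reads through the aligned O_DIRECT path when available, otherwise fall back to plain reads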
+ if (has_direct_io()) {
+ read_aligned_chunk(ptr, len);
+ } else {
+ read_raw_unsafe(ptr, len);
+ }
+ }
+
+ uint32_t read_u32() {
uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
write_raw(&val, sizeof(val));
}
+ bool has_direct_io() const {
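+ // true only while the O_DIRECT descriptor is open; the EFAULT fallback resets fd and alignment, disabling this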
+ return fd != -1 && alignment > 1;
+ }
+
~impl() {
if (fd != -1) {
close(fd);
}
}
int fd = -1;
+ std::string fname;
#endif
- void read_raw_at(void * ptr, size_t len, size_t offset) const {
- if (alignment != 1) {
- read_aligned_chunk(offset, ptr, len);
- } else {
- seek(offset, SEEK_SET);
- read_raw(ptr, len);
- }
- }
-
size_t read_alignment() const {
return alignment;
}
size_t llama_file::size() const { return pimpl->size; }
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
int llama_file::file_id() const {
#ifdef _WIN32
}
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
-void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
+void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+#endif
-uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
void seek(size_t offset, int whence) const;
- void read_raw(void * ptr, size_t len) const;
- void read_raw_at(void * ptr, size_t len, size_t offset) const;
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
- uint32_t read_u32() const;
+ void read_raw(void * ptr, size_t len);
+ void read_raw_unsafe(void * ptr, size_t len);
+ void read_aligned_chunk(void * dest, size_t size);
+ uint32_t read_u32();
void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;
size_t read_alignment() const;
+ bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
+ bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
- files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);
+ use_direct_io = use_direct_io && files.back()->has_direct_io();
+
+ // Direct I/O takes precedence: disable mmap when direct I/O is enabled and available
+ if (use_direct_io && use_mmap) {
+ use_mmap = false;
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ }
+
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
}
}
- files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
}
this->use_mmap = use_mmap;
+ this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->read_raw_at(cur->data, n_size, weight->offs);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, n_size);
if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
ggml_backend_event_synchronize(events[buffer_idx]);
// Read aligned chunk from file
- file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
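+ // destination pointer and read size are aligned by the surrounding code, so read directly and skip the bounce-buffer copy in read_raw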
+ file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
}
} else {
read_buf.resize(n_size);
- file->read_raw_at(read_buf.data(), n_size, weight->offs);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), n_size);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
size_t n_bytes = 0;
bool use_mmap = false;
+ bool use_direct_io = false;
bool check_tensors;
bool no_alloc;
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
+ bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const bool use_mmap_buffer = true;
- LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+ __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
+ /*.use_direct_io =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
model.t_start_us = tm.t_start_us;
try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();