add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
- string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
+ if (value) {
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+ }
}
).set_env("LLAMA_ARG_MMAP"));
+ add_opt(common_arg(
+ {"-dio", "--direct-io"},
+ {"-ndio", "--no-direct-io"},
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_direct_io = value;
+ }
+ ).set_env("LLAMA_ARG_DIO"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
+ mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool use_mmap = true; // use mmap for faster loads
+ bool use_mmap = true; // enable mmap to use filesystem cache
+ bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.use_mmap = params.use_mmap;
+ model_params.use_direct_io = params.use_direct_io;
model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
+ bool use_direct_io; // use direct I/O if available; takes precedence over use_mmap
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
}
}
- void read_raw(void * ptr, size_t len) const {
+ void read_raw(void * ptr, size_t len) {
size_t bytes_read = 0;
while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
}
}
- uint32_t read_u32() const {
+ uint32_t read_u32() {
uint32_t val;
read_raw(&val, sizeof(val));
return val;
write_raw(&val, sizeof(val));
}
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
- throw std::runtime_error("DirectIO is not implemented on Windows.");
+ bool has_direct_io() const {
+ return true;
}
~impl() {
}
}
#else
- impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
#ifdef __linux__
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
- fd = open(fname, O_RDONLY | O_DIRECT);
+ if (init_fd()) {
+ return;
+ }
+ LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+ fname, strerror(errno));
+ }
+#endif
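+ // Buffered path: used by default, and as the fallback when the O_DIRECT open fails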
+ init_fp(mode);
+ }
- if (fd != -1) {
- struct stat file_stats{};
- fstat(fd, &file_stats);
+#ifdef __linux__
+ bool init_fd() {
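+ // open with O_DIRECT so reads bypass the page cache; return false so the caller can fall back to buffered I/O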
+ fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
- size = file_stats.st_size;
- alignment = file_stats.st_blksize;
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);
- off_t ret = lseek(fd, 0, SEEK_SET);
- if (ret == -1) {
- throw std::runtime_error(format("seek error: %s", strerror(errno)));
- }
- return;
- }
+ size = file_stats.st_size;
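+ // O_DIRECT requires offset, length and buffer address to be block-aligned; use the block size reported by fstat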
+ alignment = file_stats.st_blksize;
- LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
- fname, strerror(errno));
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return true;
}
+ return false;
+ }
#endif
- fp = ggml_fopen(fname, mode);
+
+ void init_fp(const char * mode) {
+ fp = ggml_fopen(fname.c_str(), mode);
if (fp == NULL) {
- throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+ throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
}
}
- void read_raw(void * ptr, size_t len) const {
+ void read_raw_unsafe(void * ptr, size_t len) {
if (len == 0) {
return;
}
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
+ // EFAULT means O_DIRECT cannot DMA into this destination buffer; fall back to buffered I/O (std::fread) and retry
+ if (errno == EFAULT) {
+ auto curr_off = tell();
+ close(fd);
+ fd = -1;
+ alignment = 1;
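+ // fd == -1 and alignment == 1 mark the file as buffered, so later reads skip the aligned path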
+ init_fp("rb");
+ seek(curr_off, SEEK_SET);
+ read_raw_unsafe(ptr, len);
+ return;
+ }
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
}
}
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ void read_aligned_chunk(void * dest, size_t size) {
+ size_t offset = tell();
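+ // round the offset down and the read length up to the block size so the O_DIRECT read stays aligned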
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
seek(aligned_offset, SEEK_SET);
- read_raw(buffer.get(), bytes_to_read);
+ read_raw_unsafe(buffer.get(), bytes_to_read);
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
+ // restore the logical file position to the end of the requested range, since the aligned read advanced it past the padding
+ seek(offset + size, SEEK_SET);
}
- uint32_t read_u32() const {
+ void read_raw(void * ptr, size_t len) {
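+ // route reads through the aligned O_DIRECT path when available, otherwise fall back to plain reads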
+ if (has_direct_io()) {
+ read_aligned_chunk(ptr, len);
+ } else {
+ read_raw_unsafe(ptr, len);
+ }
+ }
+
+ uint32_t read_u32() {
uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
write_raw(&val, sizeof(val));
}
+ bool has_direct_io() const {
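+ // true only while the O_DIRECT descriptor is open; the EFAULT fallback resets fd and alignment, disabling this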
+ return fd != -1 && alignment > 1;
+ }
+
~impl() {
if (fd != -1) {
close(fd);
}
}
int fd = -1;
+ std::string fname;
#endif
- void read_raw_at(void * ptr, size_t len, size_t offset) const {
- if (alignment != 1) {
- read_aligned_chunk(offset, ptr, len);
- } else {
- seek(offset, SEEK_SET);
- read_raw(ptr, len);
- }
- }
-
size_t read_alignment() const {
return alignment;
}
size_t llama_file::size() const { return pimpl->size; }
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
int llama_file::file_id() const {
#ifdef _WIN32
}
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
-void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
+void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+#endif
-uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
void seek(size_t offset, int whence) const;
- void read_raw(void * ptr, size_t len) const;
- void read_raw_at(void * ptr, size_t len, size_t offset) const;
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
- uint32_t read_u32() const;
+ void read_raw(void * ptr, size_t len);
+ void read_raw_unsafe(void * ptr, size_t len);
+ void read_aligned_chunk(void * dest, size_t size);
+ uint32_t read_u32();
void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;
size_t read_alignment() const;
+ bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
+ bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
- files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);
+ use_direct_io = use_direct_io && files.back()->has_direct_io();
+
+ // Direct I/O takes precedence: disable mmap when direct I/O is enabled and available
+ if (use_direct_io && use_mmap) {
+ use_mmap = false;
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ }
+
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
}
}
- files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
}
this->use_mmap = use_mmap;
+ this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->read_raw_at(cur->data, n_size, weight->offs);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, n_size);
if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
ggml_backend_event_synchronize(events[buffer_idx]);
// Read aligned chunk from file
- file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
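+ // destination pointer and read size are aligned by the surrounding code, so read directly and skip the bounce-buffer copy in read_raw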
+ file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
}
} else {
read_buf.resize(n_size);
- file->read_raw_at(read_buf.data(), n_size, weight->offs);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), n_size);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
size_t n_bytes = 0;
bool use_mmap = false;
+ bool use_direct_io = false;
bool check_tensors;
bool no_alloc;
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
+ bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const bool use_mmap_buffer = true;
- LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+ __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
+ /*.use_direct_io =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
model.t_start_us = tm.t_start_us;
try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();