llama : disable Direct IO by default (#19109)

author Georgi Gerganov <redacted>

Wed, 28 Jan 2026 07:11:13 +0000 (09:11 +0200)

committer GitHub <redacted>

Wed, 28 Jan 2026 07:11:13 +0000 (09:11 +0200)
author Georgi Gerganov <redacted>
Wed, 28 Jan 2026 07:11:13 +0000 (09:11 +0200)
committer GitHub <redacted>
Wed, 28 Jan 2026 07:11:13 +0000 (09:11 +0200)
diff --git a/common/arg.cpp b/common/arg.cpp

index 04fd375d564071fc7f8e86f792fef6fe6ef657f2..2f68bdc1c08aa0722e98721ba91dd2f04a427c56 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2198,18 +2198,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      add_opt(common_arg(
          {"--mmap"},
          {"--no-mmap"},
-        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
          [](common_params & params, bool value) {
              params.use_mmap = value;
-            if (value) {
-                params.use_direct_io = false;  // disable direct io when mmap is explicitly enabled
-            }
          }
      ).set_env("LLAMA_ARG_MMAP"));
      add_opt(common_arg(
          {"-dio", "--direct-io"},
          {"-ndio", "--no-direct-io"},
-        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
          [](common_params & params, bool value) {
              params.use_direct_io = value;
          }
diff --git a/common/common.h b/common/common.h

index 96c990c05d8352d5b859e943933934fef1fbb8f1..21c11f457d4a19c168b50465e36232f262c56fbb 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -438,7 +438,7 @@ struct common_params {
  
      bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
      bool use_mmap          = true;  // enable mmap to use filesystem cache
-    bool use_direct_io     = true;  // read from disk without buffering for faster model loading
+    bool use_direct_io     = false; // read from disk without buffering
      bool use_mlock         = false; // use mlock to keep model in memory
      bool verbose_prompt    = false; // print prompt tokens before generation
      bool display_prompt    = true;  // print prompt before generation
diff --git a/include/llama.h b/include/llama.h

index c3360ae57c8120633ab25a27b41b95038acac153..bf4e28a8be16e8b478d985ba5e3a229549c56f85 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -309,7 +309,7 @@ extern "C" {
          // Keep the booleans together to avoid misalignment during copy-by-value.
          bool vocab_only;      // only load the vocabulary, no weights
          bool use_mmap;        // use mmap if possible
-        bool use_direct_io;   // use direct io, takes precedence over use_mmap
+        bool use_direct_io;   // use direct io, takes precedence over use_mmap when supported
          bool use_mlock;       // force system to keep model in RAM
          bool check_tensors;   // validate model tensor data
          bool use_extra_bufts; // use extra buffer types (used for weight repacking)
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp

index 383b8dc76188eba2c69fb9d0d21d07e52634d386..1501e392ca84e77207cbed4a6314d521940b25d3 100644 (file)
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -541,15 +541,15 @@ llama_model_loader::llama_model_loader(
  
      if (use_mmap && use_direct_io) {
          if (files.back()->has_direct_io()) {
-            // Disable mmap, as DirectIO is available
-            use_mmap = false;
              LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+            use_mmap = false;
          } else {
-            // Disable DirectIO and reopen file using std::fopen for mmap
+            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
              use_direct_io = false;
+
+            // reopen file using std::fopen for mmap
              files.pop_back();
              files.emplace_back(new llama_file(fname.c_str(), "rb", false));
-            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
          }
      }
  
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index cc784e1cb076a56e777e538cbf7c33ced9be7cf5..72490a89b56c3451f5a0119b96bbfd8443d05b12 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -8125,7 +8125,7 @@ llama_model_params llama_model_default_params() {
          /*.kv_overrides                =*/ nullptr,
          /*.vocab_only                  =*/ false,
          /*.use_mmap                    =*/ true,
-        /*.use_direct_io               =*/ true,
+        /*.use_direct_io               =*/ false,
          /*.use_mlock                   =*/ false,
          /*.check_tensors               =*/ false,
          /*.use_extra_bufts             =*/ true,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp

index a2b8d4e56ccd05069dff8ac415a96d74cb0198b8..776222cb6f2573debe7210adca68f3301a04eeab 100644 (file)
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -545,7 +545,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
      }
  
      std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
      ml.init_mappings(false); // no prefetching
  
      llama_model model(llama_model_default_params());
author	Georgi Gerganov <redacted>
	Wed, 28 Jan 2026 07:11:13 +0000 (09:11 +0200)
committer	GitHub <redacted>
	Wed, 28 Jan 2026 07:11:13 +0000 (09:11 +0200)
common/arg.cpp		patch | blob | history
common/common.h		patch | blob | history
include/llama.h		patch | blob | history
src/llama-model-loader.cpp		patch | blob | history
src/llama-model.cpp		patch | blob | history
src/llama-quant.cpp		patch | blob | history