llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)

author Cebtenzzre <redacted>

Fri, 29 Sep 2023 13:48:45 +0000 (09:48 -0400)

committer GitHub <redacted>

Fri, 29 Sep 2023 13:48:45 +0000 (16:48 +0300)
author Cebtenzzre <redacted>
Fri, 29 Sep 2023 13:48:45 +0000 (09:48 -0400)
committer GitHub <redacted>
Fri, 29 Sep 2023 13:48:45 +0000 (16:48 +0300)
diff --git a/llama.cpp b/llama.cpp

index 666acc21275327c12cf201e0d562322b6ae288c7..bff17135b985fb454648ac7128cd9915965852ff 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          nthread = std::thread::hardware_concurrency();
      }
  
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
  
      llama_model model;
      llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  
          const std::string name = ggml_get_name(tensor);
  
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
          }
-        tensor->data = read_data.data();
          ml.load_data_for(tensor);
  
          LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
author	Cebtenzzre <redacted>
	Fri, 29 Sep 2023 13:48:45 +0000 (09:48 -0400)
committer	GitHub <redacted>
	Fri, 29 Sep 2023 13:48:45 +0000 (16:48 +0300)