git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
Fixed mmap prefetch for GPU offloading (#2529)
author: Johannes Gäßler <redacted>
Mon, 7 Aug 2023 08:09:40 +0000 (10:09 +0200)
committer: GitHub <redacted>
Mon, 7 Aug 2023 08:09:40 +0000 (10:09 +0200)
llama-util.h
llama.cpp

index 3fc03ce28273ed4ed2a09647160152c68e31ea8b..6e9e39ddb6f5804e7dd706404c19d8a50d0f062f 100644 (file)
@@ -219,7 +219,7 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
+        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
index 839739870eb3e97a72fd56e68f4377a3639ac4b9..39aefd499dd0c50cc9b52361bd8b7d652b837e12 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -747,12 +747,12 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
-        size_t prefetch_size = 0;
+        size_t prefetch_size = file_loader->file.size;
         size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                prefetch_size += lt.size;
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                prefetch_size -= lt.size;
             }
         }