cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)

author slaren <redacted>

Mon, 1 May 2023 11:32:22 +0000 (13:32 +0200)

committer GitHub <redacted>

Mon, 1 May 2023 11:32:22 +0000 (13:32 +0200)
author slaren <redacted>
Mon, 1 May 2023 11:32:22 +0000 (13:32 +0200)
committer GitHub <redacted>
Mon, 1 May 2023 11:32:22 +0000 (13:32 +0200)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index 5a2701cfeef68696730b7c3c067fb41a60ad221d..c1ec306f0ed4145a8a2362fdbc08aa36e2f1ecd6 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
  }
  
  void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
      return ptr;
  }
  
diff --git a/llama-util.h b/llama-util.h

index ca4dd162f59feb945a52f33c8bc63eeffa0767bf..5f9f70ecc2030473093d9b88159c8ecbc6c6b314 100644 (file)
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
      uint8_t * addr = NULL;
      size_t size = 0;
  
+    llama_buffer() = default;
+
      void resize(size_t size) {
          delete[] addr;
          addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
      ~llama_buffer() {
          delete[] addr;
      }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
  };
  
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
  struct llama_ctx_buffer {
      uint8_t * addr = NULL;
+    bool is_cuda;
      size_t size = 0;
  
+    llama_ctx_buffer() = default;
+
      void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
          if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
          }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
          this->size = size;
      }
  
-    ~llama_ctx_buffer() {
+    void free() {
          if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
          }
+        addr = NULL;
      }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
  };
  #else
  typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp

index 3d82113a007a1a493b6fc3386b0a663b0c6efdf5..0d094a52f574ce87486c050053c9b380d1ae9c77 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
              LLAMA_ASSERT(offset == lt.size);
          } else if (lt.split_type == SPLIT_BY_COLUMNS) {
              // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
              for (size_t i = 0; i < lt.shards.size(); i++) {
                  llama_load_tensor_shard & shard = lt.shards.at(i);
                  llama_file & file = file_loaders.at(shard.file_idx)->file;
author	slaren <redacted>
	Mon, 1 May 2023 11:32:22 +0000 (13:32 +0200)
committer	GitHub <redacted>
	Mon, 1 May 2023 11:32:22 +0000 (13:32 +0200)
ggml-cuda.cu		patch \| blob \| history
llama-util.h		patch \| blob \| history
llama.cpp		patch \| blob \| history