metal : use shared buffers between CPU and GPU (#1696)

author kiltyj <redacted>

Mon, 5 Jun 2023 20:24:04 +0000 (13:24 -0700)

committer GitHub <redacted>

Mon, 5 Jun 2023 20:24:04 +0000 (23:24 +0300)
author kiltyj <redacted>
Mon, 5 Jun 2023 20:24:04 +0000 (13:24 -0700)
committer GitHub <redacted>
Mon, 5 Jun 2023 20:24:04 +0000 (23:24 +0300)
diff --git a/ggml-metal.m b/ggml-metal.m

index 3cb423a01f550894c39daabfd58d4dae107e4485..82c65963b989dd1fa37669788e397f72c5adc6c0 100644 (file)
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -195,14 +195,25 @@ bool ggml_metal_add_buffer(
              }
          }
  
+        size_t page_size = getpagesize();
+        size_t aligned_size = size;
+        if ((aligned_size % page_size) != 0) {
+            aligned_size += (page_size - (aligned_size % page_size));
+        }
+
          ctx->buffers[ctx->n_buffers].name = name;
          ctx->buffers[ctx->n_buffers].data = data;
          ctx->buffers[ctx->n_buffers].size = size;
-        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
  
-        ++ctx->n_buffers;
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+            return false;
+        } else {
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+        }
  
-        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
+        ++ctx->n_buffers;
      }
  
      return true;
diff --git a/ggml.c b/ggml.c

index 24f0d2fad19d4356d68c98a34b0e8feb2fa31070..4e3e7edb98b39ad5485d151acb4590e369977896 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -22,6 +22,10 @@
  #include <float.h>
  #include <limits.h>
  
+#ifdef GGML_USE_METAL
+#include <unistd.h>
+#endif
+
  // if C99 - static_assert is noop
  // ref: https://stackoverflow.com/a/53923785/4039976
  #ifndef static_assert
@@ -122,7 +126,11 @@ typedef void* thread_ret_t;
  #else
  inline static void* ggml_aligned_malloc(size_t size) {
      void* aligned_memory = NULL;
+#ifdef GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#else
      int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
      if (result != 0) {
          // Handle allocation failure
          return NULL;
diff --git a/llama-util.h b/llama-util.h

index 3cac9f681800bcd83962a603ca04bf79f69f4b9f..4f8a4296adc4eaf3d66cd453097cd2ab82db72c9 100644 (file)
--- a/llama-util.h
+++ b/llama-util.h
@@ -405,13 +405,29 @@ struct llama_buffer {
      llama_buffer() = default;
  
      void resize(size_t len) {
+#ifdef GGML_USE_METAL
+        free(addr);
+        int result = posix_memalign((void **) &addr, getpagesize(), len);
+        if (result == 0) {
+            memset(addr, 0, len);
+        }
+        else {
+            addr = NULL;
+        }
+#else
          delete[] addr;
          addr = new uint8_t[len];
+#endif
          size = len;
      }
  
      ~llama_buffer() {
+#ifdef GGML_USE_METAL
+        free(addr);
+#else
          delete[] addr;
+#endif
+        addr = NULL;
      }
  
      // disable copy and move
diff --git a/llama.cpp b/llama.cpp

index e2511e533c1100cd7b680d9aa0ef666c4bf1e508..d0e7151f47eceb0e8c6b4616902d2891b36de97b 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
      MODEL_65B,
  };
  
-
  static const size_t MB = 1024*1024;
  
  // computed for n_ctx == 2048
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
      ggml_set_name(embd, "embd");
      memcpy(embd->data, tokens, N*ggml_element_size(embd));
  
-#ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        ggml_metal_set_tensor(lctx.ctx_metal, embd);
-    }
-#endif
-
      struct ggml_tensor * cur;
      struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
  
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
          }
  
          ggml_graph_compute(ctx0, &gf);
-
-        if (lctx.ctx_metal) {
-            // We need to sync the CPU KV cache with the GPU KV cache
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
-        }
      }
  #else
      ggml_graph_compute(ctx0, &gf);
author	kiltyj <redacted>
	Mon, 5 Jun 2023 20:24:04 +0000 (13:24 -0700)
committer	GitHub <redacted>
	Mon, 5 Jun 2023 20:24:04 +0000 (23:24 +0300)
ggml-metal.m		patch | blob | history
ggml.c		patch | blob | history
llama-util.h		patch | blob | history
llama.cpp		patch | blob | history