fix: use `vm_allocate` to allocate CPU backend buffer on macOS (llama/9875)

author Gilad S <redacted>

Wed, 16 Oct 2024 22:36:51 +0000 (01:36 +0300)

committer Georgi Gerganov <redacted>

Wed, 23 Oct 2024 17:28:03 +0000 (20:28 +0300)
author Gilad S <redacted>
Wed, 16 Oct 2024 22:36:51 +0000 (01:36 +0300)
committer Georgi Gerganov <redacted>
Wed, 23 Oct 2024 17:28:03 +0000 (20:28 +0300)
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp

index 15d650150a5f346fae1e8cc38b739c1fdbac0ed6..6d6ffeb4efe1296e80c4913a800ff016e5300a75 100644 (file)
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
  
  // backend CPU
  
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
  static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
      return "CPU";
  
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
  }
  
  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
  }
  
  static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
  }
  
  static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
      if (data == NULL) {
          GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
          return NULL;
diff --git a/src/ggml-impl.h b/src/ggml-impl.h

index d3f4bad8c0a84b5ff29f075028296f72e304c1de..65c4f81195be3ebf6bb7bcaf4efbdf56eb2493c0 100644 (file)
--- a/src/ggml-impl.h
+++ b/src/ggml-impl.h
@@ -19,6 +19,9 @@ extern "C" {
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
  // static_assert should be a #define, but if it's not,
  // fall back to the _Static_assert C11 keyword.
  // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
  
  struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
  
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/ggml.c b/src/ggml.c

index d448a61a95942c73982ab6bb5beeafd637304783..fc087c0c07c4754fdfca1d90091c40fa89c095dd 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -35,10 +35,6 @@
  #include <omp.h>
  #endif
  
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
  #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
  #undef GGML_USE_LLAMAFILE
  #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
  #endif
  
  #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
  #include <TargetConditionals.h>
  #endif
  
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif
  
+
+void * ggml_aligned_malloc(size_t size) {
  #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
  #else
-inline static void * ggml_aligned_malloc(size_t size) {
      if (size == 0) {
          GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
          return NULL;
      }
      void * aligned_memory = NULL;
  #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
  #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
  #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
  #endif
      if (result != 0) {
          // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
          return NULL;
      }
      return aligned_memory;
+#endif
  }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
  #else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
+    free(ptr);
  #endif
+}
+
  
  inline static void * ggml_malloc(size_t size) {
      if (size == 0) {
@@ -3865,7 +3893,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  
      *ctx = (struct ggml_context) {
          /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
          /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
          /*.no_alloc           =*/ params.no_alloc,
          /*.no_alloc_save      =*/ params.no_alloc,
@@ -3905,7 +3933,7 @@ void ggml_free(struct ggml_context * ctx) {
                      __func__, i, ggml_used_mem(ctx));
  
              if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
+                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
              }
  
              found = true;
@@ -19604,9 +19632,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
  void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
      if (!threadpool) return;
  
+    const int n_threads = threadpool->n_threads_max;
+
  #ifndef GGML_USE_OPENMP
      struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
  
      ggml_mutex_lock(&threadpool->mutex);
  
@@ -19626,8 +19655,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
      ggml_cond_destroy(&threadpool->cond);
  #endif // GGML_USE_OPENMP
  
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
  }
  
  #ifndef GGML_USE_OPENMP
@@ -20059,7 +20089,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
                  struct ggml_cplan * cplan) {
  
      struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
      {
          threadpool->cgraph           = cgraph;
          threadpool->cplan            = cplan;
@@ -20080,7 +20110,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
  
      // Allocate and init workers state
      const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
  
      memset(workers, 0, workers_size);
      for (int j = 0; j < tpp->n_threads; j++) {
author	Gilad S <redacted>
	Wed, 16 Oct 2024 22:36:51 +0000 (01:36 +0300)
committer	Georgi Gerganov <redacted>
	Wed, 23 Oct 2024 17:28:03 +0000 (20:28 +0300)
src/ggml-backend.cpp		patch | blob | history
src/ggml-impl.h		patch | blob | history
src/ggml.c		patch | blob | history