metal : use residency sets (llama/11427)

author Georgi Gerganov <redacted>

Sun, 26 Jan 2025 18:06:16 +0000 (20:06 +0200)

committer Georgi Gerganov <redacted>

Mon, 3 Feb 2025 20:00:57 +0000 (22:00 +0200)
author Georgi Gerganov <redacted>
Sun, 26 Jan 2025 18:06:16 +0000 (20:06 +0200)
committer Georgi Gerganov <redacted>
Mon, 3 Feb 2025 20:00:57 +0000 (22:00 +0200)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m

index a85502ee0893b5bb4c33766aa35bffd857df1fd1..c9474345da9715bdf65ea15c2f537c970fc6d7be 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -19,7 +19,10 @@
  // max number of MTLCommandBuffer used to submit a graph for processing
  #define GGML_METAL_MAX_COMMAND_BUFFERS 8
  
-#define UNUSED(x) (void)(x)
+// residency sets are compiled in only when building for macOS against the 15.0 (or newer) SDK
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
  
  // globals
  
@@ -39,6 +42,7 @@ static struct ggml_backend_metal_device_context {
  
      bool has_simdgroup_reduction;
      bool has_simdgroup_mm;
+    bool has_residency_sets;
      bool has_bfloat;
      bool use_bfloat;
  
@@ -48,6 +52,7 @@ static struct ggml_backend_metal_device_context {
      /*.mtl_device_ref_count    =*/ 0,
      /*.has_simdgroup_reduction =*/ false,
      /*.has_simdgroup_mm        =*/ false,
+    /*.has_residency_sets      =*/ false,
      /*.has_bfloat              =*/ false,
      /*.use_bfloat              =*/ false,
      /*.name                    =*/ "",
@@ -65,6 +70,10 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
  
          ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
  
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
          ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
          ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
  
@@ -483,6 +492,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
      GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
  
      ctx->queue  = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
      ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
  
      id<MTLLibrary> metal_library;
@@ -649,6 +663,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
  
      GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction     ? "true" : "false");
      GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm            ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets    = %s\n", __func__, ctx_dev->has_residency_sets          ? "true" : "false");
      GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat                  ? "true" : "false");
      GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat                  ? "true" : "false");
      GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1050,70 @@ struct ggml_backend_metal_buffer_context {
      // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
      int n_buffers;
      struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
  };
  
+// rset init
+//
+// Creates the optional MTLResidencySet for a buffer context, adds every Metal buffer of the
+// context to it, commits the set and requests residency.
+//
+//   ctx     - buffer context; ctx->rset is first reset to nil and, on success, holds the new set.
+//             ctx->buffers[0 .. n_buffers) are the allocations added to the set.
+//   ctx_dev - device context; when has_residency_sets is false no set is created.
+//   device  - Metal device used to create the residency set.
+//
+// Returns false only when the residency set could not be created. Returns true otherwise,
+// including when residency sets are disabled or not available (ctx->rset stays nil then).
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        // start from nil so the pointer is never read uninitialized
+        NSError * error = nil;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+
+        // the descriptor is no longer needed, whether or not creation succeeded
+        [desc release];
+
+        // failure is signalled by the nil return value, not by the state of the error pointer
+        if (ctx->rset == nil) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
+            return false;
+        }
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+//
+// Tears down the residency set stored in ctx->rset, if there is one: ends residency,
+// removes all allocations from the set and releases it. Does nothing when ctx->rset is nil
+// or when residency sets are not compiled in / not available at runtime.
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        id rset = ctx->rset;
+
+        // no residency set was created for this buffer context
+        if (rset == nil) {
+            return;
+        }
+
+        [rset endResidency];
+        [rset removeAllAllocations];
+        [rset release];
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
  // finds the Metal buffer that contains the tensor data on the GPU device
  // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
  // Metal buffer based on the host memory pointer
@@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
      for (int i = 0; i < ctx->n_buffers; i++) {
          [ctx->buffers[i].metal release];
      }
+
+    ggml_backend_metal_buffer_rset_free(ctx);
      ggml_backend_metal_device_rel(buffer->buft->device->context);
  
      if (ctx->owned) {
@@ -4198,19 +4277,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
  static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
      memset((char *)tensor->data + offset, value, size);
  
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
  }
  
  static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
      memcpy((char *)tensor->data + offset, data, size);
  
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
  }
  
  static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
      memcpy(data, (const char *)tensor->data + offset, size);
  
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
  }
  
  static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
      }
      return false;
  
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
  }
  
  static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4325,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
  static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
      return "Metal";
  
-    UNUSED(buft);
+    GGML_UNUSED(buft);
  }
  
  static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
      }
  #endif
  #endif
-    UNUSED(device);
-    UNUSED(size_aligned);
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
  }
  
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
          size_aligned += (size_page - (size_aligned % size_page));
      }
  
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
  
      ctx->all_data = ggml_metal_host_malloc(size_aligned);
      ctx->all_size = size_aligned;
@@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
      if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
          GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
          free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
          return NULL;
      }
  
@@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
  
  static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
      return 32;
-    UNUSED(buft);
+    GGML_UNUSED(buft);
  }
  
  static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
  
      return max_size;
  
-    UNUSED(buft);
+    GGML_UNUSED(buft);
  }
  
  static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
      return true;
  
-    UNUSED(buft);
+    GGML_UNUSED(buft);
  }
  
  ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
  static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
      return "Metal_Mapped";
  
-    UNUSED(buft);
+    GGML_UNUSED(buft);
  }
  
  static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
          size_aligned += (size_page - (size_aligned % size_page));
      }
  
-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
  
      // the buffer fits into the max buffer size allowed by the device
      if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
          }
      }
  
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
      return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
  }
  
@@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
  static const char * ggml_backend_metal_name(ggml_backend_t backend) {
      return "Metal";
  
-    UNUSED(backend);
+    GGML_UNUSED(backend);
  }
  
  static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
          }
      }
  
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
      return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
  }
  
@@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
      return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
              buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
  
-    UNUSED(dev);
+    GGML_UNUSED(dev);
  }
  
  static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
author	Georgi Gerganov <redacted>
	Sun, 26 Jan 2025 18:06:16 +0000 (20:06 +0200)
committer	Georgi Gerganov <redacted>
	Mon, 3 Feb 2025 20:00:57 +0000 (22:00 +0200)