// max number of MTLCommandBuffer used to submit a graph for processing
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
-#define UNUSED(x) (void)(x)
+// create residency sets only on macOS >= 15.0
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
// globals
bool has_simdgroup_reduction;
bool has_simdgroup_mm;
+ bool has_residency_sets;
bool has_bfloat;
bool use_bfloat;
/*.mtl_device_ref_count =*/ 0,
/*.has_simdgroup_reduction =*/ false,
/*.has_simdgroup_mm =*/ false,
+ /*.has_residency_sets =*/ false,
/*.has_bfloat =*/ false,
/*.use_bfloat =*/ false,
/*.name =*/ "",
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+ ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
ctx->queue = [device newCommandQueue];
+ if (ctx->queue == nil) {
+ GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+ return NULL;
+ }
+
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library;
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
+ GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
int n_buffers;
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+ // optional MTLResidencySet
+ id rset;
};
+// rset init
+//
+// Creates the optional residency set for a buffer context, adds every Metal
+// buffer in ctx->buffers[0 .. n_buffers) to it, commits the set and requests
+// residency.
+//
+//   ctx     - buffer context; ctx->rset is reset to nil before anything else
+//   ctx_dev - device context; nothing is created when has_residency_sets is false
+//   device  - Metal device used to create the residency set
+//
+// Returns false only when the residency set could not be created. Returns true
+// in every other case, including when residency sets are disabled, compiled out
+// or unavailable at runtime (ctx->rset stays nil in those cases).
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        // start from nil: this file uses manual reference counting, so an
+        // uninitialized local would hold an indeterminate value whenever the
+        // call below succeeds without writing to it
+        NSError * error = nil;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+
+        // the descriptor is not used past this point, on success or on failure
+        [desc release];
+
+        // success is decided by the returned object, not by the error out-parameter
+        if (ctx->rset == nil) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown");
+            return false;
+        }
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+//
+// Tears down the optional residency set stored in ctx->rset: ends residency,
+// removes all allocations, releases the set and clears the field. Does nothing
+// when ctx->rset is nil or when residency sets are compiled out.
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+
+            // do not leave a dangling pointer behind: a repeated call or a later
+            // read of ctx->rset must see nil rather than a released object
+            ctx->rset = nil;
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
for (int i = 0; i < ctx->n_buffers; i++) {
[ctx->buffers[i].metal release];
}
+
+ ggml_backend_metal_buffer_rset_free(ctx);
ggml_backend_metal_device_rel(buffer->buft->device->context);
if (ctx->owned) {
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
- UNUSED(buffer);
+ GGML_UNUSED(buffer);
}
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
memcpy((char *)tensor->data + offset, data, size);
- UNUSED(buffer);
+ GGML_UNUSED(buffer);
}
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
memcpy(data, (const char *)tensor->data + offset, size);
- UNUSED(buffer);
+ GGML_UNUSED(buffer);
}
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
}
return false;
- UNUSED(buffer);
+ GGML_UNUSED(buffer);
}
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal";
- UNUSED(buft);
+ GGML_UNUSED(buft);
}
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
}
#endif
#endif
- UNUSED(device);
- UNUSED(size_aligned);
+ GGML_UNUSED(device);
+ GGML_UNUSED(size_aligned);
}
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size_aligned += (size_page - (size_aligned % size_page));
}
- id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+ id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
ctx->all_data = ggml_metal_host_malloc(size_aligned);
ctx->all_size = size_aligned;
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
free(ctx);
- ggml_backend_metal_device_rel(buft->device->context);
+ ggml_backend_metal_device_rel(ctx_dev);
+ return NULL;
+ }
+
+ if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+ GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+ free(ctx);
+ ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 32;
- UNUSED(buft);
+ GGML_UNUSED(buft);
}
static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
return max_size;
- UNUSED(buft);
+ GGML_UNUSED(buft);
}
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
- UNUSED(buft);
+ GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Mapped";
- UNUSED(buft);
+ GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
size_aligned += (size_page - (size_aligned % size_page));
}
- id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+ id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
// the buffer fits into the max buffer size allowed by the device
if (size_aligned <= device.maxBufferLength) {
}
}
+ if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+ GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+ free(ctx);
+ ggml_backend_metal_device_rel(ctx_dev);
+ return NULL;
+ }
+
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";
- UNUSED(backend);
+ GGML_UNUSED(backend);
}
static void ggml_backend_metal_free(ggml_backend_t backend) {
}
}
+ if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+ GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+ free(ctx);
+ ggml_backend_metal_device_rel(ctx_dev);
+ return NULL;
+ }
+
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
- UNUSED(dev);
+ GGML_UNUSED(dev);
}
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {