--- /dev/null
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+ printf("\nexample usage:\n");
+ printf("\n %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+ printf("\n");
+}
+
+int main(int argc, char ** argv) {
+ common_params params;
+
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+ return 1;
+ }
+
+ common_init();
+
+ // init LLM
+
+ llama_backend_init();
+ llama_numa_init(params.numa);
+
+ // initialize the model
+
+ llama_model_params model_params = common_model_params_to_llama(params);
+
+ llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+
+ if (model == NULL) {
+ LOG_ERR("%s: error: unable to load model\n", __func__);
+ return 1;
+ }
+
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ // we need just a dummy token to evaluate
+ std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
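+ // a small context is sufficient - we only ever decode a single token at a time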
+ llama_context_params ctx_params = llama_context_default_params();
+ ctx_params.n_ctx = 512;
+ ctx_params.n_batch = 512;
+ ctx_params.no_perf = false;
+
+ llama_context * ctx = llama_init_from_model(model, ctx_params);
+ if (ctx == NULL) {
+ LOG_ERR("%s: error: failed to create the llama_context\n", __func__);
+ return 1;
+ }
+
+ llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+ const int n_iters = 3;
+
+ // warm-up run (not measured) - make sure any one-time initialization does not affect the timings below
+ llama_decode(ctx, batch);
+ llama_memory_clear(llama_get_memory(ctx), true);
+ llama_synchronize(ctx);
+
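+ // sweep the idle pause between decodes - if the residency keep-alive works as intended,
+ // the decode time should stay roughly constant regardless of how long the GPU has been idle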
+ for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
+ double t_sum_us = 0.0;
+ double t_sum2_us = 0.0;
+
+ for (int i = 0; i < n_iters; i++) {
+ // this pause is important - it simulates "idle GPU"
+ std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+ const int64_t t_start_us = llama_time_us();
+
+ // this should take constant time
+ llama_decode(ctx, batch);
+ llama_synchronize(ctx);
+
+ const int64_t t_end_us = llama_time_us();
+
+ const double t_cur_us = t_end_us - t_start_us;
+
+#if 1
+ // print individual decode times
+ printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+ t_sum_us += t_cur_us;
+ t_sum2_us += t_cur_us * t_cur_us;
+
+ llama_memory_clear(llama_get_memory(ctx), true);
+ llama_synchronize(ctx); // just in case
+ }
+
+ const double t_avg_us = t_sum_us / n_iters;
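+ // unbiased sample standard deviation: s = sqrt((sum(x_i^2) - n*mean^2) / (n - 1))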
+ const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+ printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+ fflush(stdout);
+ }
+
+ llama_free(ctx);
+ llama_model_free(model);
+
+ return 0;
+}
};
struct ggml_metal {
- id<MTLDevice> device;
- id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
-
ggml_metal_device_t dev;
ggml_metal_library_t lib;
// init context
ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
- res->device = ggml_metal_device_get_obj(dev);
+ id<MTLDevice> device = ggml_metal_device_get_obj(dev);
- GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
+ GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
// TODO: would it be better to have one queue for the backend and one queue for the device?
// the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
//res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
- res->queue = ggml_metal_device_get_queue(dev);
- if (res->queue == nil) {
+ id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
+ if (queue == nil) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
return NULL;
}
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@autoreleasepool {
// wrap the source data into a Metal buffer
- id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+ id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+ id<MTLBuffer> buf_src = [device newBufferWithBytes:data
length:size
options:MTLResourceStorageModeShared];
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
- id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+ id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+ id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:buf_src
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@autoreleasepool {
- id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+ id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+ id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
- id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+ id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+ id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:bid_src.metal
// number of threads in addition to the main thread
const int n_cb = ctx->n_cb;
+ // keep the memory wired
+ ggml_metal_device_rsets_keep_alive(ctx->dev);
+
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
if (!ctx->capture_started) {
// create capture scope
- ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+ id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+ ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
descriptor.captureObject = ctx->capture_scope;
}
}
+ // short-hand
+ id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+
// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
{
- id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs[n_cb].obj) {
// prepare the rest of the command buffers asynchronously (optional)
// cmd_buf[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
- id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs[cb_idx].obj) {
}
bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
- GGML_ASSERT(ctx->device != nil);
+ GGML_ASSERT(ctx->dev != nil);
+
+ id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
- return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+ return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
#import "ggml-metal-device.h"
#import "ggml-impl.h"
-#import "ggml-threading.h"
#include <Foundation/Foundation.h>
// ref: https://github.com/ggml-org/llama.cpp/pull/15906
id<MTLCommandQueue> mtl_queue;
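+
+ // residency sets of the device's buffers - kept wired by a background heartbeat thread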
+ ggml_metal_rsets_t rsets;
+
ggml_metal_library_t library;
struct ggml_metal_device_props props;
};
+//
+// MTLResidencySet wrapper
+//
+
+struct ggml_metal_rsets {
+ NSLock * lock;
+
+ NSMutableArray * data;
+
+ // number of seconds to keep the residency sets wired after the last graph computation,
+ // so that the OS does not page the memory out while the device is briefly idle
+ int keep_alive_s;
+
+ // background heartbeat thread to keep the residency sets alive
+ atomic_bool d_stop;
+ atomic_int d_loop;
+
+ dispatch_group_t d_group;
+};
+
+ggml_metal_rsets_t ggml_metal_rsets_init(void) {
+ ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
+
+ res->lock = [[NSLock alloc] init];
+ res->data = [[NSMutableArray alloc] init];
+
+ // by default keep the memory wired for 3 minutes
+ res->keep_alive_s = 3*60;
+
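+ // the keep-alive duration can be overridden via the GGML_METAL_RESIDENCY_KEEP_ALIVE_S environment variable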
+ const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
+ if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
+ res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
+ }
+
+ if (res->keep_alive_s <= 0) {
+ res->keep_alive_s = 3*60;
+ }
+
+ GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
+
+ atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
+ atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
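+ // the heartbeat ticks every 0.5 s, so 2*keep_alive_s ticks cover keep_alive_s seconds of inactivity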
+
+ res->d_group = dispatch_group_create();
+
+ // start a background thread that periodically requests residency for all the currently active sets in the collection
+ // the requests stop after a certain amount of time (keep_alive_s) of inactivity
+ dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
+ dispatch_group_async(res->d_group, d_queue, ^{
+ while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
+ if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
+ [res->lock lock];
+
+ for (int i = 0; i < (int) res->data.count; ++i) {
+ [res->data[i] requestResidency];
+ }
+
+ atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
+
+ [res->lock unlock];
+ }
+
+ // half a second
+ usleep(500 * 1000);
+ }
+ });
+
+ return res;
+}
+
+void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
+ if (rsets == NULL) {
+ return;
+ }
+
+ GGML_ASSERT([rsets->data count] == 0);
+
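+ // signal the heartbeat thread to stop and wait for it to exit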
+ atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
+
+ dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
+ dispatch_release(rsets->d_group);
+
+ [rsets->data release];
+ [rsets->lock release];
+
+ free(rsets);
+}
+
ggml_metal_device_t ggml_metal_device_init(void) {
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
}
+ if (dev->props.use_residency_sets) {
+ dev->rsets = ggml_metal_rsets_init();
+ } else {
+ dev->rsets = nil;
+ }
+
// --------------------------------------------------
// print MTL GPU family:
void ggml_metal_device_free(ggml_metal_device_t dev) {
assert(dev != NULL);
+ ggml_metal_rsets_free(dev->rsets);
+
ggml_metal_library_free(dev->library);
dev->library = NULL;
return dev->library;
}
+void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
+ if (rset == nil) {
+ return;
+ }
+
+ GGML_ASSERT(dev->rsets);
+
+ [dev->rsets->lock lock];
+
+ [dev->rsets->data addObject:rset];
+
+ [dev->rsets->lock unlock];
+}
+
+void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
+ if (rset == nil) {
+ return;
+ }
+
+ GGML_ASSERT(dev->rsets);
+
+ [dev->rsets->lock lock];
+
+ [dev->rsets->data removeObject:rset];
+
+ [dev->rsets->lock unlock];
+}
+
+void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
+ if (dev->rsets == NULL) {
+ return;
+ }
+
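+ // restart the heartbeat countdown - residency will keep being requested for another keep_alive_s seconds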
+ atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
+}
+
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
if (@available(macOS 10.12, iOS 16.0, *)) {
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
// note: cannot explicitly use "id<MTLResidencySet>" here because it is not available on certain OSes
id rset;
- // pointers to global device objects
- id<MTLDevice> device;
- id<MTLCommandQueue> queue;
+ // pointer to the global device
+ ggml_metal_device_t dev;
};
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
desc.initialCapacity = buf->n_buffers;
NSError * error;
- buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
+ buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
[desc release];
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
+ res->dev = dev;
+
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
res->owned = true;
- res->device = ggml_metal_device_get_obj(dev);
- res->queue = ggml_metal_device_get_queue(dev);
-
res->n_buffers = 1;
if (res->all_data != NULL) {
if (size_aligned > 0) {
if (props_dev->use_shared_buffers && shared) {
- res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
+ res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
} else {
- res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+ res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
}
}
return NULL;
}
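+ // register the residency set with the device-wide collection so the background heartbeat thread keeps it resident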
+ ggml_metal_device_rsets_add(dev, res->rset);
+
//ggml_metal_log_allocated_size(device, size_aligned);
return res;
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
+ res->dev = dev;
+
res->all_data = ptr;
res->all_size = size;
size_aligned += (size_page - (size_aligned % size_page));
}
- res->device = ggml_metal_device_get_obj(dev);
- res->queue = ggml_metal_device_get_queue(dev);
-
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
// the buffer fits into the max buffer size allowed by the device
res->buffers[res->n_buffers].metal = nil;
if (size_aligned > 0) {
- res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+ res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (res->buffers[res->n_buffers].metal == nil) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
}
}
- ggml_metal_log_allocated_size(res->device, size_aligned);
+ ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
++res->n_buffers;
} else {
res->buffers[res->n_buffers].metal = nil;
if (size_step_aligned > 0) {
- res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+ res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (res->buffers[res->n_buffers].metal == nil) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
}
}
- ggml_metal_log_allocated_size(res->device, size_step_aligned);
+ ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
if (i + size_step < size) {
GGML_LOG_INFO("\n");
return NULL;
}
+ ggml_metal_device_rsets_add(dev, res->rset);
+
return res;
}
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
+ ggml_metal_device_rsets_rm(buf->dev, buf->rset);
+
for (int i = 0; i < buf->n_buffers; i++) {
[buf->buffers[i].metal release];
}
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
bid_dst.offs += offset;
- id<MTLCommandQueue> queue = buf->queue;
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@autoreleasepool {
// src
void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
- id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
+ id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
// this is an alternative to waitUntilCompleted, which should be faster, but it doesn't seem to make much difference
dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
- id<MTLCommandQueue> queue = buf->queue;
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
bid_src.offs += offset;
// dst
- id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
+ id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
GGML_ASSERT(buf_dst);
- id<MTLCommandQueue> queue = buf->queue;
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
}
@autoreleasepool {
- id<MTLCommandQueue> queue = buf->queue;
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];