#include <Metal/Metal.h>
+#include <stdatomic.h>
+
#ifndef TARGET_OS_VISION
#define TARGET_OS_VISION 0
#endif
// fallback for MTLGPUFamilyMetal3 (not available in some environments)
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+// virtual address for GPU memory allocations
+static atomic_uintptr_t g_addr_device = 0x000000400ULL;
+
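+// A minimal sketch of how the counter is used (hypothetical sizes; the
+// non-zero start value presumably keeps handles distinct from NULL, and
+// relaxed ordering suffices because only the uniqueness of each returned
+// range matters, not the order in which threads observe the counter):
+//
+//   uintptr_t a = atomic_fetch_add_explicit(&g_addr_device, 0x1000, memory_order_relaxed); // -> 0x400
+//   uintptr_t b = atomic_fetch_add_explicit(&g_addr_device, 0x1000, memory_order_relaxed); // -> 0x1400
+//
+// Each private (non-shared) buffer thus receives a unique, stable
+// pseudo-address even though no host memory backs it.
+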
#if !GGML_METAL_EMBED_LIBRARY
// Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@end
struct ggml_metal_buffer {
- void * all_data; // TODO: https://github.com/ggml-org/llama.cpp/pull/15985
+ void * all_data;
size_t all_size;
// if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
bool is_shared;
if (shared) {
res->all_data = ggml_metal_host_malloc(size_aligned);
res->is_shared = true;
- res->owned = true;
} else {
- // dummy, non-NULL value - we'll populate this after creating the Metal buffer below
- res->all_data = (void *) 0x000000400ULL;
+ // use virtual address from g_addr_device counter
+ res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
res->is_shared = false;
}
res->all_size = size_aligned;
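+ // ownership is recorded unconditionally now: the shared path owns the
+ // host allocation, the private path owns the Metal buffer created below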
+ res->owned = true;
+
res->device = ggml_metal_device_get_obj(dev);
res->queue = ggml_metal_device_get_queue(dev);
res->buffers[0].metal = nil;
if (size_aligned > 0) {
- if (props_dev->use_shared_buffers &&shared) {
+ if (props_dev->use_shared_buffers && shared) {
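+ // note: newBufferWithBytesNoCopy expects a page-aligned allocation;
+ // ggml_metal_host_malloc is assumed to provide one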
res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
} else {
res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
-
- res->all_data = (void *) (res->buffers[0].metal.gpuAddress);
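+ // all_data already holds a unique virtual address from g_addr_device,
+ // so reading the buffer's gpuAddress back is no longer needed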
}
}
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
if (buf->is_shared) {
- memset((char *)tensor->data + offset, value, size);
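+ // shared buffers are host-visible: tensor->data points into the unified
+ // memory allocation, so a plain CPU-side memset suffices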
+ memset((char *) tensor->data + offset, value, size);
return;
}
void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (buf->is_shared) {
- memcpy((char *)tensor->data + offset, data, size);
+ memcpy((char *) tensor->data + offset, data, size);
return;
}
void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
if (buf->is_shared) {
- memcpy(data, (const char *)tensor->data + offset, size);
+ memcpy(data, (const char *) tensor->data + offset, size);
return;
}
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
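+ // the presence of frequency factors is now signaled explicitly via args.src2
+ // instead of being inferred from src2 aliasing src0 (same change in each rope variant below)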
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
// end of mrope
- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);