GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32, mul_mm_mxfp4_f32, has_simdgroup_mm);
- GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32, mul_mm_mxfp4_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, has_simdgroup_mm);
NSString * key = [NSString stringWithUTF8String:name];
[ctx->kernels_ext setObject:obj forKey:key];
+ [metal_function release];
+ [obj release];
+
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) kernel.pipeline,
(int) kernel.pipeline.maxTotalThreadsPerThreadgroup,
(int) kernel.pipeline.threadExecutionWidth);
char name[256];
@autoreleasepool {
- MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
-
const int32_t dk = (int32_t) op->src[1]->ne[0];
const int32_t dv = (int32_t) op->src[2]->ne[0];
return res;
}
- cv = [[MTLFunctionConstantValues alloc] init];
+ MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
[cv setConstantValue:&has_mask type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT + 0];
[cv setConstantValue:&has_sinks type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT + 1];
[cv setConstantValue:&ns20 type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT + 21];
[cv setConstantValue:&nsg type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT + 22];
- return ggml_metal_compile_kernel(backend, base, name, cv);
+ res = ggml_metal_compile_kernel(backend, base, name, cv);
+
+ [cv release];
+
+ return res;
}
}
char name[256];
@autoreleasepool {
- MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
-
const int32_t dk = (int32_t) op->src[1]->ne[0];
const int32_t dv = (int32_t) op->src[2]->ne[0];
return res;
}
- cv = [[MTLFunctionConstantValues alloc] init];
+ MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
[cv setConstantValue:&has_mask type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT_VEC + 0];
[cv setConstantValue:&has_sinks type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT_VEC + 1];
[cv setConstantValue:&nsg type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC + 22];
[cv setConstantValue:&nwg type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC + 23];
- return ggml_metal_compile_kernel(backend, base, name, cv);
+ res = ggml_metal_compile_kernel(backend, base, name, cv);
+
+ [cv release];
+
+ return res;
}
}
char name[256];
@autoreleasepool {
- MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
-
snprintf(base, 256, "kernel_flash_attn_ext_vec_reduce");
snprintf(name, 256, "kernel_flash_attn_ext_vec_reduce_dv=%d_nwg=%d", dv, nwg);
return res;
}
- cv = [[MTLFunctionConstantValues alloc] init];
+ MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
[cv setConstantValue:&dv type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC_REDUCE + 0];
[cv setConstantValue:&nwg type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC_REDUCE + 1];
- return ggml_metal_compile_kernel(backend, base, name, cv);
+ res = ggml_metal_compile_kernel(backend, base, name, cv);
+
+ [cv release];
+
+ return res;
}
GGML_UNUSED(op);
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
[cmd_buf retain];
+ if (ctx->cmd_bufs[n_cb].obj) {
+ [ctx->cmd_bufs[n_cb].obj release];
+ }
ctx->cmd_bufs[n_cb].obj = cmd_buf;
[cmd_buf enqueue];