vk::PhysicalDeviceProperties properties;
std::string name;
uint64_t max_memory_allocation_size;
+ uint64_t max_buffer_size;
uint64_t suballocation_block_size;
bool fp16;
bool bf16;
static void ggml_backend_vk_free(ggml_backend_t backend);
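+// Largest range of buf that can be bound as a storage buffer starting at offset:
+// the bytes remaining in the buffer, clamped to the device's maxStorageBufferRange limit.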
+static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) {
+ const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset},
+ VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
+ return range;
+}
+
// Wait for ctx->fence to be signaled.
static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
// Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
- if (size > device->max_memory_allocation_size) {
- throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+ if (size > device->max_buffer_size) {
+ throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
}
vk_buffer buf = std::make_shared<vk_buffer_struct>();
buf.reset();
}
-static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
- return { buf, 0, VK_WHOLE_SIZE };
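+// Build a subbuffer descriptor for buf at offset, with the range clamped by
+// ggml_vk_get_max_buffer_range so a binding never exceeds maxStorageBufferRange.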
+static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) {
+ return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) };
}
static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
- device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
+ device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
} else if (maintenance4_support) {
device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
} else {
device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
}
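+    // The maximum buffer size is tracked separately from the maximum allocation size:
+    // with maintenance4 it comes from maxBufferSize, otherwise it falls back to the allocation limit.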
+ const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE");
+
+ if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) {
+ device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE);
+ } else if (maintenance4_support) {
+ device->max_buffer_size = props4.maxBufferSize;
+ } else {
+ device->max_buffer_size = device->max_memory_allocation_size;
+ }
+
const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
- device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+ device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
} else {
// Limit batching of allocations to 1GB by default to avoid fragmentation issues
device->suballocation_block_size = 1024*1024*1024;
}
const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
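+    // The dequant and split_k scratch buffers are each bound as a single storage buffer
+    // descriptor, so reject sizes that exceed what one binding can address.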
if (
- (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
- (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
- (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
+ (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+ (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+ (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) {
GGML_ABORT("Requested preallocation size is too large");
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
}
if (x_non_contig) {
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
} else if (qx_needs_dequant) {
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+ ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
}
- // No bounds checking is needed for dst. This is basically VK_WHOLE_SIZE but clamped to maxStorageBufferRange.
- VkDeviceSize d_range = std::min(VkDeviceSize{d_D->size - d_buf_offset}, VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
-
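+    // The dst range from ggml_vk_subbuffer is already clamped to maxStorageBufferRange,
+    // so no extra bounds handling is needed for d_D.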
// compute
ggml_vk_matmul(
ctx, subctx, pipeline,
{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total },
- { d_D, d_buf_offset, d_range }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
+ ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
ne01, ne11, ne10,
ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d,
split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
}
if (
- (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
- (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+ (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+ (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
GGML_ABORT("Requested preallocation size is too large");
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
}
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
}
if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+ ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
if (
- (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
- (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+ (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+ (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
GGML_ABORT("Requested preallocation size is too large");
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
}
if (x_non_contig) {
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
} else if (qx_needs_dequant) {
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
if (
- (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
- (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+ (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+ (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
GGML_ABORT("Requested preallocation size is too large");
}
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
if (x_non_contig) {
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
}
if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
ctx->prealloc_y_last_tensor_used = src1;
}
// Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
// and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
- if (split_k_size > ctx->device->max_memory_allocation_size) {
+ if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
GGML_ABORT("Requested preallocation size is too large");
}
if (ctx->prealloc_size_split_k < split_k_size) {
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
- vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+ ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
+ ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
+ ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
+ ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
+ ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+ ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
},
// We only use split_k when group query attention is enabled, which means
// there's no more than one tile of rows (i.e. workgroups_x would have been
const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
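+        // Combine the split_k partial results into the final output.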
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
{
- vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
- vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+ ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
+ ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+ ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
},
pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
ctx->prealloc_split_k_need_sync = true;
} else {
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
- vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
- vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+ ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
+ ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
+ ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
+ ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
+ ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+ ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
},
pc, { workgroups_x, workgroups_y, workgroups_z });
}
}
}
- uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0;
- uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
- uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
- uint64_t d_sz = ggml_type_size(dst->type) * ned;
-
vk_buffer d_D = dst_buf_ctx->dev_buffer;
- // Workaround for tiny tensor inputs on ROPE
- if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
- y_sz = VK_WHOLE_SIZE;
- }
-
GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
if(!src0_uma) {
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
- if (op_supports_incontiguous) {
- x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
- y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
- z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
- d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
-
- if (x_buf_offset + x_sz >= d_X->size) {
- x_sz = VK_WHOLE_SIZE;
- }
- if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
- y_sz = VK_WHOLE_SIZE;
- }
- if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
- z_sz = VK_WHOLE_SIZE;
- }
- if (d_buf_offset + d_sz >= d_D->size) {
- d_sz = VK_WHOLE_SIZE;
- }
- }
-
std::array<uint32_t, 3> elements;
// Single call if dimension 2 is contiguous
break;
}
- if (!op_supports_incontiguous) {
- if (x_sz != VK_WHOLE_SIZE) {
- x_sz *= ne02 * ne03;
+ uint64_t x_sz, y_sz, z_sz, d_sz;
+
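+    // Compute the descriptor range for each binding; if a full-tensor range would run past
+    // the end of its buffer, clamp it to the maximum range the device can address.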
+ if (op_supports_incontiguous) {
+ x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
+ y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
+ z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
+ d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
+
+ if (x_buf_offset + x_sz >= d_X->size) {
+ x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset);
}
- if (use_src1 && y_sz != VK_WHOLE_SIZE) {
- y_sz *= ne12 * ne13;
+ if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+ y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset);
}
- if (use_src2 && z_sz != VK_WHOLE_SIZE) {
- z_sz *= ne22 * ne23;
+ if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+ z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset);
}
- if (d_sz != VK_WHOLE_SIZE) {
- d_sz *= ned2 * ned3;
+ if (d_buf_offset + d_sz >= d_D->size) {
+ d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset);
}
+ } else {
+ x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03;
+ y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0;
+ z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0;
+ d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3;
}
if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) {
{ vk_subbuffer{ d_X, x_buf_offset, x_sz },
vk_subbuffer{ d_Y, y_buf_offset, y_sz },
vk_subbuffer{ d_D, d_buf_offset, d_sz },
- vk_subbuffer{ d_A, a_buf_offset, VK_WHOLE_SIZE },
+ ggml_vk_subbuffer(ctx, d_A, a_buf_offset),
}, pc, elements);
} else if (op == GGML_OP_GLU) {
// Empty src1 is possible in glu, but the shader needs a buffer
static_assert(MAX_PARAMETER_COUNT == 12);
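+    // Each of the MAX_PARAMETER_COUNT bindings uses a range clamped by ggml_vk_subbuffer.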
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
- vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[8], offset[8], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[9], offset[9], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[10], offset[10], VK_WHOLE_SIZE },
- vk_subbuffer{ buf[11], offset[11], VK_WHOLE_SIZE },
+ ggml_vk_subbuffer(ctx, buf[0], offset[0]),
+ ggml_vk_subbuffer(ctx, buf[1], offset[1]),
+ ggml_vk_subbuffer(ctx, buf[2], offset[2]),
+ ggml_vk_subbuffer(ctx, buf[3], offset[3]),
+ ggml_vk_subbuffer(ctx, buf[4], offset[4]),
+ ggml_vk_subbuffer(ctx, buf[5], offset[5]),
+ ggml_vk_subbuffer(ctx, buf[6], offset[6]),
+ ggml_vk_subbuffer(ctx, buf[7], offset[7]),
+ ggml_vk_subbuffer(ctx, buf[8], offset[8]),
+ ggml_vk_subbuffer(ctx, buf[9], offset[9]),
+ ggml_vk_subbuffer(ctx, buf[10], offset[10]),
+ ggml_vk_subbuffer(ctx, buf[11], offset[11]),
}, pc, elements);
}
ggml_vk_ctx_begin(ctx->device, subctx);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_matmul(
- ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
+ ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k),
m, n, k,
k, k, m, k*m, k*n, m*n,
split_k, batch, batch, batch, 1, 1, n
//
// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
// ggml_vk_ctx_begin(ctx->device, subctx);
-// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
+// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne);
// ggml_vk_ctx_end(subctx);
//
// auto begin = std::chrono::high_resolution_clock::now();