// mulmat
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size };
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size };
- std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 8, 32, 32, 2, 2, 2, vk_device.subgroup_size };
+ std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size };
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
if (op == GGML_OP_CPY) {
GGML_ASSERT(!transfer_src0);
GGML_ASSERT(!transfer_src1);
- d_sz = dst->ne[1] * dst->nb[1];
+ x_sz = ggml_nbytes(src0);
+ d_sz = ggml_nbytes(dst);
if (extra->offset + d_sz >= d_D->size) {
d_sz = VK_WHOLE_SIZE;
}
ggml_vk_preallocate_buffers();
+ int last_node = cgraph->n_nodes - 1;
+
+ // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
+ while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
+ last_node -= 1;
+ }
+
for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+ ggml_vk_build_graph(cgraph->nodes[i], i == last_node);
}
ggml_compute_params params = {};