vk::Fence fence, almost_ready_fence;
bool almost_ready_fence_pending {};
+ // Cache the most recent tensor that was converted into prealloc_y, and which pipeline was used to convert it.
+ vk_pipeline_struct * prealloc_y_last_pipeline_used {};
+ const ggml_tensor * prealloc_y_last_tensor_used {};
+
vk_buffer buffer_pool[MAX_VK_BUFFERS];
vk_context_ref compute_ctx;
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
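+ // Skip the copy if prealloc_y already holds src1 converted with this pipeline.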
+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+ ctx->prealloc_y_last_tensor_used != src1) {
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+ ctx->prealloc_y_last_tensor_used = src1;
+ }
}
if (quantize_y) {
- ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
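+ // Skip requantization if prealloc_y already holds src1 quantized to q8_1.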
+ if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
+ ctx->prealloc_y_last_tensor_used != src1) {
+ ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+ ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
+ ctx->prealloc_y_last_tensor_used = src1;
+ }
}
uint32_t stride_batch_x = ne00*ne01;
}
if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+ ctx->prealloc_y_last_tensor_used != src1) {
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+ ctx->prealloc_y_last_tensor_used = src1;
+ }
}
// For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+ ctx->prealloc_y_last_tensor_used != src1) {
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+ ctx->prealloc_y_last_tensor_used = src1;
+ }
}
uint32_t stride_batch_x = ne00*ne01;
}
if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
- ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+ ctx->prealloc_y_last_tensor_used != src1) {
+ ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+ ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+ ctx->prealloc_y_last_tensor_used = src1;
+ }
}
uint32_t stride_batch_y = ne10*ne11;
GGML_ASSERT(nei0 <= 4096);
const uint32_t split_size = std::min(nei1, 4096u / nei0);
- ggml_tensor src1_copy = *src1;
- ggml_tensor src2_copy = *src2;
- ggml_tensor dst_copy = *dst;
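+ // No split needed: pass the original tensors through so the prealloc_y cache (keyed on the real tensor pointer) stays usable.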
+ if (split_size == nei1) {
+ ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+ } else {
+ ggml_tensor src1_copy = *src1;
+ ggml_tensor src2_copy = *src2;
+ ggml_tensor dst_copy = *dst;
- for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
- const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+ for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+ const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
- src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
- src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
- dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+ src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+ src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+ dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
- src1_copy.ne[2] = n_tokens;
- src2_copy.ne[1] = n_tokens;
- dst_copy.ne[2] = n_tokens;
+ src1_copy.ne[2] = n_tokens;
+ src2_copy.ne[1] = n_tokens;
+ dst_copy.ne[2] = n_tokens;
- ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+ ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+ // invalidate the cached prealloc_y: the cache is keyed on the tensor pointer, and src1_copy is only a stack-local copy of src1
+ ctx->prealloc_y_last_pipeline_used = {};
+ ctx->prealloc_y_last_tensor_used = nullptr;
+ }
}
}
}
ggml_vk_pool_free(ctx, buffer);
}
ctx->gc.temp_buffers.clear();
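+ // Tensor addresses may be reused by the next graph, so drop the prealloc_y cache here.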
+ ctx->prealloc_y_last_pipeline_used = {};
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
ggml_vk_destroy_buffer(ctx->prealloc_x);
ggml_vk_destroy_buffer(ctx->prealloc_y);
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
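+ // prealloc_y is being destroyed, so any cached conversion result in it is gone.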
+ ctx->prealloc_y_last_pipeline_used = nullptr;
for (auto& buffer : ctx->buffer_pool) {
ggml_vk_destroy_buffer(buffer);
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
}
+ ctx->prealloc_y_last_pipeline_used = nullptr;
+ ctx->prealloc_y_last_tensor_used = nullptr;
+
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
const std::array<int64_t, 4> per; // permutation of dimensions
const bool v; // whether a and b are non-contiguous views
+ const uint32_t o; // number of outputs
std::string vars() override {
- return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
+ return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
}
double max_nmse_err() override {
std::array<int64_t, 2> bs = {10, 10},
std::array<int64_t, 2> nr = {2, 2},
std::array<int64_t, 4> per = {0, 1, 2, 3},
- bool v = false)
- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
+ bool v = false, uint32_t o = 1)
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
ggml_set_name(out, "out");
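+ // With o > 1, chain extra mul_mats over the same inputs and sum them, so the test runs the whole graph at once (see run_whole_graph below).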
+ for (uint32_t i = 1; i < o; ++i) {
+ ggml_tensor * out2 = ggml_mul_mat(ctx, a, b);
+ ggml_set_name(out2, "out2");
+ out = ggml_add(ctx, out, out2);
+ }
return out;
}
+
+ bool run_whole_graph() override { return o > 1; }
+
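+ // The graph root is an ADD when o > 1; report MUL_MAT so per-op filtering still matches this test.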
+ std::string op_desc(ggml_tensor * t) override {
+ GGML_UNUSED(t);
+ return ggml_op_name(GGML_OP_MUL_MAT);
+ }
};
// GGML_OP_MUL_MAT_ID
const int64_t m;
const int64_t n;
const int64_t k;
+ const uint32_t o; // number of outputs
std::string vars() override {
- return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
+ return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
}
double max_nmse_err() override {
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
int n_mats = 8, int n_used = 2, bool b = false,
- int64_t m = 32, int64_t n = 32, int64_t k = 32)
+ int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
: type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
- m(m), n(n), k(k) {
+ m(m), n(n), k(k), o(o) {
GGML_ASSERT(n_used <= n_mats);
}
ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
ggml_set_name(out, "out");
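+ // Extra outputs multiply fresh expert matrices by the same B and ids, then accumulate into out.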
+ for (uint32_t i = 1; i < o; ++i) {
+ ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+ ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
+ ggml_set_name(out2, "out2");
+ out = ggml_add(ctx, out, out2);
+ }
+
return out;
}
}
}
}
+
+ bool run_whole_graph() override { return o > 1; }
+
+ std::string op_desc(ggml_tensor * t) override {
+ GGML_UNUSED(t);
+ return ggml_op_name(GGML_OP_MUL_MAT_ID);
+ }
};
// GGML_OP_OUT_PROD
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
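+ // multi-output case (o = 3) to exercise whole-graph execution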
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));
for (auto bs2 : {1,3}) {
for (auto bs : {1,2,4,8}) {
}
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
+ test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {