struct ggml_backend_vk_context;
#define MAX_PARAMETER_COUNT 8
+// Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT
+// (N fused adds read N+1 srcs and write one dst).
+#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 2)
struct vk_pipeline_struct {
std::string name;
bool float_controls_rte_fp16;
bool subgroup_add;
bool subgroup_shuffle;
+ bool multi_add;
bool integer_dot_product;
vk_pipeline pipeline_div[2][2][2];
vk_pipeline pipeline_div_norepeat[2][2][2];
+ // indexed by num_additional_fused_ops == num_adds - 1
+ vk_pipeline pipeline_multi_add[MAX_FUSED_ADDS];
+
vk_pipeline pipeline_add_id_f32;
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
float param1; float param2; int32_t param3;
};
+struct vk_op_multi_add_push_constants {
+ // shape for dst
+ uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23;
+
+ // strides for srcs+dst, in float elements
+ uint32_t nb[8][4];
+};
+
struct vk_op_add_id_push_constants {
uint32_t ne0;
uint32_t ne1;
CREATE_BINARY(div, _norepeat, {1})
#undef CREATE_BINARY
+ if (device->multi_add) {
+ for (uint32_t i = 0; i < MAX_FUSED_ADDS; ++i) {
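+ // Variant i handles i+1 fused adds; the specialization constant (i+2) equals the number of src buffers for that variant.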
+ ggml_vk_create_pipeline(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1);
+ }
+ }
+
ggml_vk_create_pipeline(device, device->pipeline_add_id_f32, "add_id_f32", add_id_f32_len, add_id_f32_data, "main", 4, sizeof(vk_op_add_id_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
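+ // Gate the fused multi_add path on RTE fp16 rounding, a push constant budget that fits
+ // vk_op_multi_add_push_constants, and runtime descriptor arrays; Intel is excluded and
+ // GGML_VK_DISABLE_MULTI_ADD turns the path off.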
+ device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 &&
+ device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_multi_add_push_constants) &&
+ vk12_features.runtimeDescriptorArray &&
+ device->vendor_id != VK_VENDOR_ID_INTEL &&
+ getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr;
+
if (device->subgroup_size_control) {
device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
switch (op) {
case GGML_OP_ADD:
{
+ if (ctx->num_additional_fused_ops > 0) {
+ return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops];
+ }
auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add;
return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16];
}
}, dryrun);
}
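+// Fused handler for a chain of GGML_OP_ADD nodes starting at node_idx: all distinct srcs are
+// bound to one descriptor set and summed in a single dispatch that writes only the final dst.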
+static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
+ const ggml_tensor *first_node = cgraph->nodes[node_idx];
+ const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
+
+ // Make a list of all the tensors used by the op.
+ // Last element of the list is the dest tensor.
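+ // num_additional_fused_ops + 1 adds share the first node's two srcs plus one extra src per fused node.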
+ const ggml_tensor *tensors[MAX_PARAMETER_COUNT];
+ uint32_t num_srcs = ctx->num_additional_fused_ops + 2;
+ uint32_t num_tensors = num_srcs + 1;
+ GGML_ASSERT(num_tensors <= MAX_PARAMETER_COUNT);
+
+ tensors[0] = first_node->src[0];
+ tensors[1] = first_node->src[1];
+ for (int32_t i = 0; i < ctx->num_additional_fused_ops; ++i) {
+ // check whether the previous result is src[0] or src[1]
+ if (cgraph->nodes[node_idx + i] == cgraph->nodes[node_idx + i + 1]->src[0]) {
+ tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[1];
+ } else {
+ tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[0];
+ }
+ }
+ tensors[num_srcs] = dst;
+
+ vk_op_multi_add_push_constants pc;
+ pc.ne20 = (uint32_t)dst->ne[0];
+ pc.ne21 = (uint32_t)dst->ne[1];
+ pc.ne22 = (uint32_t)dst->ne[2];
+ pc.ne23 = (uint32_t)dst->ne[3];
+
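+ // Store strides in float elements (everything on this path is F32).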
+ for (uint32_t i = 0; i < num_tensors; ++i) {
+ const ggml_tensor *t = tensors[i];
+ pc.nb[i][0] = (uint32_t)t->nb[0] / sizeof(float);
+ pc.nb[i][1] = (uint32_t)t->nb[1] / sizeof(float);
+ pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float);
+ pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float);
+ }
+
+ vk_pipeline pipeline = ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops];
+
+ if (pipeline == nullptr) {
+ std::cerr << "ggml_vulkan: Error: Missing multi_add pipeline" << std::endl;
+ GGML_ABORT("fatal error");
+ }
+
+ if (dryrun) {
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
+ return;
+ }
+
+ ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT];
+ vk_buffer buf[MAX_PARAMETER_COUNT];
+ size_t offset[MAX_PARAMETER_COUNT];
+ bool uma[MAX_PARAMETER_COUNT];
+
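+ // Resolve each tensor to a Vulkan buffer and offset, preferring a host-visible UMA mapping when available.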
+ for (uint32_t i = 0; i < num_tensors; ++i) {
+ buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context;
+ buf[i] = nullptr;
+ offset[i] = 0;
+ uma[i] = false;
+
+ if (ctx->device->uma) {
+ ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]);
+ uma[i] = buf[i] != nullptr;
+ }
+ if (!uma[i]) {
+ buf[i] = buf_ctx[i]->dev_buffer;
+ offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs;
+ }
+ GGML_ASSERT(buf[i] != nullptr);
+ }
+ // If any remaining descriptors are unused, just point them at src[0]
+ for (uint32_t i = num_tensors; i < MAX_PARAMETER_COUNT; ++i) {
+ buf[i] = buf[0];
+ offset[i] = 0;
+ }
+
+ std::array<uint32_t, 3> elements;
+
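+ // Fold the flat element count into up to three dispatch dimensions to stay within per-dimension workgroup limits.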
+ uint32_t ne = ggml_nelements(dst);
+ if (ne > 262144) {
+ elements = { 512, 512, CEIL_DIV(ne, 262144) };
+ } else if (ne > 512) {
+ elements = { 512, CEIL_DIV(ne, 512), 1 };
+ } else {
+ elements = { ne, 1, 1 };
+ }
+
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
+ {
+ vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE },
+ vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE },
+ }, pc, elements);
+}
+
static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
break;
case GGML_OP_ADD:
- ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);
-
+ if (ctx->num_additional_fused_ops) {
+ ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ } else {
+ ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);
+ }
break;
case GGML_OP_SUB:
ggml_vk_sub(ctx, compute_ctx, src0, src1, node, dryrun);
return true;
}
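+// Returns the number of consecutive GGML_OP_ADD nodes starting at node_idx that can be folded
+// into one multi_add dispatch, or 0 if no fusion applies.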
+static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
+
+ const ggml_tensor *first_node = cgraph->nodes[node_idx];
+ if (first_node->op != GGML_OP_ADD) {
+ return 0;
+ }
+
+ if (!ctx->device->multi_add) {
+ return 0;
+ }
+
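+ // Count the run of consecutive adds starting at node_idx, capped at MAX_FUSED_ADDS.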
+ int32_t num_adds = 1;
+ while (node_idx + num_adds < cgraph->n_nodes &&
+ cgraph->nodes[node_idx + num_adds]->op == GGML_OP_ADD &&
+ num_adds < MAX_FUSED_ADDS) {
+ num_adds++;
+ }
+
+ // The shader currently requires that all operands share the same shape (different strides
+ // are allowed), that everything is F32, and that no operand is misaligned.
+ for (int32_t i = 0; i < num_adds; ++i) {
+ const ggml_tensor *next_node = cgraph->nodes[node_idx + i];
+ if (!ggml_are_same_shape(first_node, next_node->src[0]) ||
+ !ggml_are_same_shape(first_node, next_node->src[1]) ||
+ next_node->type != GGML_TYPE_F32 ||
+ next_node->src[0]->type != GGML_TYPE_F32 ||
+ next_node->src[1]->type != GGML_TYPE_F32 ||
+ get_misalign_bytes(ctx, next_node) ||
+ get_misalign_bytes(ctx, next_node->src[0]) ||
+ get_misalign_bytes(ctx, next_node->src[1])) {
+ num_adds = i;
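+ // Shrinking num_adds here also terminates the loop, since i < num_adds fails on the next iteration.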
+ }
+ }
+
+ // Verify we can fuse these
+ ggml_op adds[MAX_FUSED_ADDS];
+ for (int32_t i = 0; i < num_adds; ++i) {
+ adds[i] = GGML_OP_ADD;
+ }
+
+ // decrease num_adds if they can't all be fused
+ while (num_adds > 1 && !ggml_can_fuse(cgraph, node_idx, adds, num_adds)) {
+ num_adds--;
+ }
+
+ // a single add is not "fused", so just return zero
+ if (num_adds == 1) {
+ return 0;
+ }
+ return num_adds;
+}
+
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
uint64_t total_mat_mul_bytes = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
- if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
- ctx->num_additional_fused_ops = 1;
+ if (!ctx->device->disable_fusion) {
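+ // num_additional_fused_ops excludes the current node, hence num_adds - 1.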
+ uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
+ if (num_adds) {
+ ctx->num_additional_fused_ops = num_adds - 1;
+ } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
+ ctx->num_additional_fused_ops = 1;
+ }
}
ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
}
- if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
- ctx->num_additional_fused_ops = 1;
+ if (!ctx->device->disable_fusion) {
+ uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
+ if (num_adds) {
+ ctx->num_additional_fused_ops = num_adds - 1;
+ } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
+ ctx->num_additional_fused_ops = 1;
+ }
}
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)