for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
-
-
#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
prev_i = i;
continue;
}
+ // broadcast ("repeating") adds are not supported here: skip unless both operands of each bias ADD have the same shape
+ if (bias_op == GGML_OP_ADD &&
+ (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+ !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+ continue;
+ }
+
const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];
continue;
}
+ if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+ continue;
+ }
+
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;
ggml_tensor * build_graph(ggml_context * ctx) override {
if (!use_id) {
- std::array<int64_t, 4> ne = {k, m, 1, 1};
- std::array<int64_t, 4> ne0 = {k, n, 1, 1};
+ const int channels = 4;
+ const int samples = 2;
+ std::array<int64_t, 4> ne = { k, m, channels, samples };
+ std::array<int64_t, 4> ne0 = { k, n, channels, samples };
ggml_tensor * cur = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr;
ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
if (with_bias) {
- std::array<int64_t, 4> bias_ne = {ffn_up->ne[0], 1, 1, 1};
+ std::array<int64_t, 4> bias_ne = { ffn_up->ne[0], 1, channels, samples };
ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
ffn_up = ggml_add(ctx, ffn_up, up_bias);
}
ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr;
if (with_bias && with_gate) {
- std::array<int64_t, 4> bias_ne = {ffn_gate->ne[0], 1, 1, 1};
+ std::array<int64_t, 4> bias_ne = { ffn_gate->ne[0], 1, channels, samples };
ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
ffn_gate = ggml_add(ctx, ffn_gate, gate_bias);
}