// NOTE(review): this chunk appears to be corrupted patch/merge residue rather than
// compilable source. It splices two unrelated fragments — the tail of a CUDA
// graph-evaluation loop (stream assignment / fusion debug logging) and part of an
// FFN/MoE graph builder — and contains stray unified-diff hunk markers ('-' / '+')
// embedded as line prefixes. Issues are flagged inline below; no code is changed.
GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
}
}
// NOTE(review): stray diff marker — '- prev_i = i;' is a removed-line hunk from a
// patch, not valid C++. The removal pairs with the '+' hunk after the #ifdef block.
- prev_i = i;
#ifdef GGML_CUDA_DEBUG
// Number of graph nodes fused since the last processed node index (prev_i);
// only computed/logged in debug builds.
const int nodes_fused = i - prev_i - 1;
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
// NOTE(review): this '}' just before '#endif' looks unbalanced relative to the
// '#ifdef' region above — confirm brace matching against the full function; it may
// be merge damage rather than intentional scoping.
}
#endif
// NOTE(review): stray diff marker — '+ prev_i = i;' is an added-line hunk; the
// intended code is presumably 'prev_i = i;' (moving the update after the debug
// block so nodes_fused is computed before prev_i advances) — TODO confirm upstream.
+ prev_i = i;
// Skip empty tensors and layout-only ops (reshape/transpose/view/permute/none):
// they require no kernel launch, so this iteration is skipped.
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
// NOTE(review): unreachable — the 'continue' above always exits this iteration,
// so this GGML_ABORT can never execute; it looks like injected/merge junk and
// should be removed once the full function is in view.
GGML_ABORT("fatal error");
}
// NOTE(review): the remainder of this chunk belongs to a different function (an
// FFN/MoE graph builder), not the loop above — further evidence this chunk splices
// two files. The next three lines are diff-removal hunks, not code.
- //expand here so that we can fuse ffn gate
- ggml_build_forward_expand(gf, cur);
-
// Parallel-gate FFN path: element-wise multiply the gate branch (tmp) into cur.
if (gate && type_gate == LLM_FFN_PAR) {
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
// NOTE(review): a GGML_ABORT immediately after the normal gate-multiply path
// would abort every parallel-gate build; this is almost certainly injected junk —
// verify against the upstream implementation before removing.
GGML_ABORT("fatal error");
}
// NOTE(review): another pair of diff-removal hunks (same 'expand for fusion'
// lines as above) — patch residue, not code.
- //expand here so that we can fuse ffn gate
- ggml_build_forward_expand(gf, cur);
-
// Expert down-projection via LoRA-aware matmul-with-ids; comment on the original
// line documents the result shape as [n_embd, n_expert_used, n_tokens].
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);