//
uint32_t llama_context::graph_max_nodes() const {
- return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+ return std::max<uint32_t>(1024u, 8u*model.n_tensors());
}
// NOTE(review): this function is declared to return llm_graph_result * but the
// body visible here is empty — control falls off the end of a non-void function,
// which is undefined behavior in C++. Presumably it should return the context's
// reserved graph-result object (an internal member not visible in this chunk) —
// confirm against the full file; the body may have been lost in a bad merge/paste.
llm_graph_result * llama_context::get_gf_res_reserve() const {
}
// aggregate experts
+ // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+ // to avoid potentially a large number of add nodes during warmup
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14753
ggml_tensor * moe_out = nullptr;
- for (int i = 0; i < n_expert_used; ++i) {
+ for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
experts->nb[2], i*experts->nb[1]);
}
}
- if (n_expert_used == 1) {
+ if (hparams.n_expert_used == 1) {
// avoid returning a non-contiguous tensor
moe_out = ggml_cont(ctx0, moe_out);
}