}
struct llm_build_llama : public llm_graph_context {
- llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
};
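// [editor's note] The pattern applied throughout this diff: `gf` stops being
// threaded through every builder and becomes state of the base context. A
// minimal sketch of the assumed layout (member name and initializer inferred
// from the TODO removed near the bottom of this diff, not a verified API):
struct llm_graph_context_sketch {
    ggml_cgraph * gf; // the graph under construction, set once per build

    explicit llm_graph_context_sketch(const llm_graph_params & params)
        : gf(params.res->get_gf()) {} // previously fetched at each call site
};
// With `gf` as a member, unchanged lines such as
// `ggml_build_forward_expand(gf, ...)` inside the builders keep compiling:
// `gf` now resolves to the member instead of the removed parameter.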
struct llm_build_llama_iswa : public llm_graph_context {
- llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur_normed", il);
}
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
};
struct llm_build_deci : public llm_graph_context {
- llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
};
struct llm_build_baichuan : public llm_graph_context {
- llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
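// [editor's note] Matching assumed change to the `build_attn` overload set:
// the graph argument disappears from the public signature and the member `gf`
// is used internally. A hedged sketch (parameter list mirrors the call sites
// in this diff; the real declaration may differ):
ggml_tensor * build_attn_sketch(
        llm_graph_input_attn_kv_unified * inp,
        ggml_tensor * wo,    ggml_tensor * wo_b,
        ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur,
        ggml_tensor * kq_b,  ggml_tensor * v_mla,
        float kq_scale, int il);
// Call sites shrink accordingly:
//     before: cur = build_attn(inp_attn, gf, wo, bo, Qcur, Kcur, Vcur, nullptr, nullptr, scale, il);
//     after : cur = build_attn(inp_attn,     wo, bo, Qcur, Kcur, Vcur, nullptr, nullptr, scale, il);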
struct llm_build_xverse : public llm_graph_context {
- llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_falcon : public llm_graph_context {
- llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_grok : public llm_graph_context {
- llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
};
struct llm_build_dbrx : public llm_graph_context {
- llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_starcoder : public llm_graph_context {
- llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_refact : public llm_graph_context {
- llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_bert : public llm_graph_context {
- llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
};
struct llm_build_neo_bert : public llm_graph_context {
- llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
};
struct llm_build_bloom : public llm_graph_context {
- llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_mpt : public llm_graph_context {
- llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_stablelm : public llm_graph_context {
- llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_qwen : public llm_graph_context {
- llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_qwen2 : public llm_graph_context {
- llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_dream : public llm_graph_context {
- llm_build_dream(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) :
+ llm_build_dream(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
// copied from qwen2
const int64_t n_embd_head = hparams.n_embd_head_v;
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
+ cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_qwen2vl : public llm_graph_context {
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_qwen2moe : public llm_graph_context {
- llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_qwen3 : public llm_graph_context {
- llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_qwen3moe : public llm_graph_context {
- llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_phi2 : public llm_graph_context {
- llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
template<bool iswa>
struct llm_build_phi3 : public llm_graph_context {
- llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
};
struct llm_build_plamo : public llm_graph_context {
- llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_gpt2 : public llm_graph_context {
- llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_codeshell : public llm_graph_context {
- llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_orion : public llm_graph_context {
- llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_internlm2 : public llm_graph_context {
- llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_minicpm3 : public llm_graph_context {
- llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
// TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
cb(k_states, "k_states", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}
};
struct llm_build_gemma : public llm_graph_context {
- llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
ggml_tensor * cur;
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur_scaled", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
};
struct llm_build_gemma2_iswa : public llm_graph_context {
- llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
ggml_tensor * cur;
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
};
struct llm_build_gemma3_iswa : public llm_graph_context {
- llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
ggml_tensor * cur;
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
}
struct llm_build_gemma3n_iswa : public llm_graph_context {
const llama_model & model;
- ggml_cgraph * gf;
const int64_t n_embd_head;
const int64_t n_embd_altup;
const int n_layer_sparsity = 10; // number of layers using activation sparsity
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
- llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
+ llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
: llm_graph_context(params),
model(model),
- gf(gf),
n_embd_head(model.hparams.n_embd_head_k),
n_embd_altup(model.hparams.n_embd_altup),
n_altup(model.hparams.n_altup),
cb(Qcur, "Qcur_pos", il);
cb(Kcur, "Kcur_pos", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
} else {
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_pos", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
}
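// [editor's note] llm_build_gemma3n_iswa previously cached `gf` as its own
// member (removed above); with `gf` living on llm_graph_context that duplicate
// disappears and the struct keeps only model-specific state.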
// TODO: move up next to build_starcoder
struct llm_build_starcoder2 : public llm_graph_context {
- llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
ggml_tensor * build_mamba_layer(
llm_graph_input_rs * inp,
- ggml_cgraph * gf,
ggml_tensor * cur,
const llama_model & model,
const llama_ubatch & ubatch,
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
- ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
};
- ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
// store last states
ggml_build_forward_expand(gf,
ggml_tensor * build_mamba2_layer(
llm_graph_input_rs * inp,
- ggml_cgraph * gf,
- ggml_tensor * cur,
- const llama_model & model,
- const llama_ubatch & ubatch,
- int il) const {
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) const {
const auto * mctx_cur = inp->mctx;
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
- ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
};
- ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
// store last states
ggml_build_forward_expand(gf,
};
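// [editor's note] The recurrent-state helper gets the same treatment: the
// graph argument is dropped from `build_rs`, so the mamba layers pass one
// argument less. Hedged sketch of the assumed new shape (the real overload
// set, including the row-getter variant used above, may differ):
ggml_tensor * build_rs_sketch(
        llm_graph_input_rs * inp,
        ggml_tensor * states_all, // e.g. mctx_cur->get_r_l(il) / get_s_l(il)
        int64_t state_size, int32_t n_seqs);
// Storing the last states via `ggml_build_forward_expand(gf, ...)` stays
// unchanged, reading the member `gf`.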
struct llm_build_mamba : public llm_graph_context_mamba {
- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
cb(cur, "attn_norm", il);
if (model.arch == LLM_ARCH_MAMBA2) {
- cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
+ cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
} else {
- cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
+ cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
}
if (il == n_layer - 1 && inp_out_ids) {
};
struct llm_build_jamba : public llm_graph_context_mamba {
- llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
+ llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
ggml_tensor * cur;
cb(cur, "attn_norm", il);
if (n_head_kv == 0) {
- cur = build_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il);
+ cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
} else {
// Attention
cb(Vcur, "Vcur", il);
// No RoPE :)
- cur = build_attn(inp_hybrid->get_attn(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
+ cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
};
struct llm_build_command_r : public llm_graph_context {
- llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_cohere2_iswa : public llm_graph_context {
- llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
// * removed bias
// * removed MoE
struct llm_build_olmo : public llm_graph_context {
- llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_olmo2 : public llm_graph_context {
- llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
// * removed bias
// * added q, k norm
struct llm_build_olmoe : public llm_graph_context {
- llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_openelm : public llm_graph_context {
- llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Qcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_gptneox : public llm_graph_context {
- llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_arctic : public llm_graph_context {
- llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_deepseek : public llm_graph_context {
- llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
};
struct llm_build_deepseek2 : public llm_graph_context {
- llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
bool is_lite = (hparams.n_layer == 27);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
cb(Vcur, "Vcur", il);
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
cb(Kcur, "Kcur", il);
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
};
struct llm_build_bitnet : public llm_graph_context {
- llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
NULL, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
};
struct llm_build_t5_enc : public llm_graph_context {
- llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo_enc, nullptr,
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
};
struct llm_build_t5_dec : public llm_graph_context {
- llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
- cur = build_attn(inp_attn_self, gf,
+ cur = build_attn(inp_attn_self,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
- cur = build_attn(inp_attn_cross, gf,
+ cur = build_attn(inp_attn_cross,
model.layers[il].wo_cross, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
};
struct llm_build_jais : public llm_graph_context {
- llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
};
struct llm_build_chatglm : public llm_graph_context {
- llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_glm4 : public llm_graph_context {
- llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_nemotron : public llm_graph_context {
- llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_exaone : public llm_graph_context {
- llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
ggml_tensor * build_rwkv6_time_mix(
llm_graph_input_rs * inp,
- ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * x_prev,
const llama_ubatch & ubatch,
}
ggml_tensor * wkv_state = build_rs(
- inp, gf, mctx_cur->get_s_l(il),
+ inp, mctx_cur->get_s_l(il),
hparams.n_embd_s(), n_seqs);
ggml_tensor * wkv_output;
};
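// [editor's note] The RWKV helpers follow suit: `build_rwkv_token_shift_load`
// and the `build_rwkv{6,7}_time_mix` methods lose their `ggml_cgraph *`
// parameter (see the call sites below). Assumed sketch of the load helper:
ggml_tensor * build_rwkv_token_shift_load_sketch(
        llm_graph_input_rs * inp, const llama_ubatch & ubatch, int il);
// The matching `build_rwkv_token_shift_store` results are still expanded into
// the member `gf` via ggml_build_forward_expand, as in the unchanged lines.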
struct llm_build_rwkv6 : public llm_build_rwkv6_base {
- llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
+ llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
GGML_ASSERT(hparams.token_shift_count == 2);
ggml_tensor * cur;
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
1
);
- cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
- llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
+ llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
GGML_ASSERT(n_embd == hparams.n_embd_r());
ggml_tensor * cur;
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
cb(att_norm, "attn_norm", il);
1
);
- cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
ggml_tensor * build_rwkv7_time_mix(
llm_graph_input_rs * inp,
- ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * x_prev,
ggml_tensor *& first_layer_value,
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
ggml_tensor * wkv_state = build_rs(
- inp, gf, mctx_cur->get_s_l(il),
+ inp, mctx_cur->get_s_l(il),
hparams.n_embd_s(), n_seqs);
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
};
struct llm_build_rwkv7 : public llm_build_rwkv7_base {
- llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
+ llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
GGML_ASSERT(hparams.token_shift_count == 2);
ggml_tensor * cur;
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
1
);
- cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
struct llm_build_arwkv7 : public llm_build_rwkv7_base {
- llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
+ llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
GGML_ASSERT(n_embd == hparams.n_embd_r());
ggml_tensor * cur;
const llama_layer * layer = &model.layers[il];
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
cb(att_norm, "attn_norm", il);
1
);
- cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
struct llm_build_granite : public llm_graph_context {
llm_build_granite(
const llama_model & model,
- const llm_graph_params & params,
- ggml_cgraph * gf)
+ const llm_graph_params & params)
: llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
// self-attention
cur = build_attention_layer(
- gf, cur, inp_pos, inp_attn,
+ cur, inp_pos, inp_attn,
model, n_embd_head, il);
if (il == n_layer - 1 && inp_out_ids) {
}
ggml_tensor * build_attention_layer(
- ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
cb(Vcur, "Vcur", il);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
};
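// [editor's note] Builder-owned helpers get the same signature trim: granite's
// build_attention_layer above, granite_hybrid's below, and lfm2's
// build_attn_block / build_shortconv_block all drop their leading
// `ggml_cgraph * gf` parameter, with bodies otherwise unchanged.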
struct llm_build_granite_hybrid : public llm_graph_context_mamba {
-
llm_build_granite_hybrid(
const llama_model & model,
- const llm_graph_params & params,
- ggml_cgraph * gf) :
+ const llm_graph_params & params) :
llm_graph_context_mamba(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
if (hparams.is_recurrent(il)) {
// ssm layer //
- cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
} else {
// attention layer //
cur = build_attention_layer(
- gf, cur, inp_pos, inp->get_attn(), model,
+ cur, inp_pos, inp->get_attn(), model,
n_embd_head, il);
}
}
ggml_tensor * build_attention_layer(
- ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
cb(Vcur, "Vcur", il);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
// * removed bias
// * removed MoE
struct llm_build_chameleon : public llm_graph_context {
- llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_wavtokenizer_dec : public llm_graph_context {
- llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
};
struct llm_build_plm : public llm_graph_context {
- llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
cb(k_states, "k_states", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
}
};
struct llm_build_bailingmoe : public llm_graph_context {
- llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}
};
struct llm_build_dots1 : public llm_graph_context {
- llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_ernie4_5 : public llm_graph_context {
- llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
};
struct llm_build_ernie4_5_moe : public llm_graph_context {
- llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
};
struct llm_build_falcon_h1 : public llm_graph_context_mamba {
- llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
ggml_tensor * cur;
cb(Kcur, "Kcur-post-rope", il);
cb(Vcur, "Vcur-post-rope", il);
- ggml_tensor * attn_out = build_attn(inp->get_attn(), gf,
+ ggml_tensor * attn_out = build_attn(inp->get_attn(),
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(attn_out, "attn_out", il);
// Mamba2 layer
cb(cur, "ssm_in", il);
- ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
+ ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
cb(ssm_out, "ssm_out", il);
// Aggregation
};
struct llm_build_plamo2 : public llm_graph_context_mamba {
- llm_build_plamo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
+ llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
if (is_mamba_layer) {
// PLaMo-2 Mamba layer
- cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il);
+ cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
} else {
// PLaMo-2 Attention layer
- cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, gf, cur, model, il);
+ cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
}
// post_mixer_norm
ggml_tensor * build_plamo2_attn_layer(
llm_graph_input_attn_kv_unified * inp,
ggml_tensor * inp_pos,
- ggml_cgraph * gf,
ggml_tensor * cur,
const llama_model & model,
int il) {
ext_factor, attn_factor, beta_fast, beta_slow
);
- cur = build_attn(inp, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+ cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
}
cb(cur, "attn_out", il);
ggml_tensor * build_plamo2_mamba_layer(
llm_graph_input_rs * inp,
- ggml_cgraph * gf,
ggml_tensor * cur,
const llama_model & model,
const llama_ubatch & ubatch,
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
- ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
};
- ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
cb(y_ssm, "mamba_ssm_scan", il);
// store last states
};
struct llm_build_arcee : public llm_graph_context {
- llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
};
struct llm_build_hunyuan_moe : public llm_graph_context {
- llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
LLM_NORM_RMS, il);
cb(Qcur, "Qcur_norm", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
};
struct llm_build_smollm3 : public llm_graph_context {
- llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
+ cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
struct llm_build_lfm2 : public llm_graph_context {
const llama_model & model;
- llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+ llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
ggml_tensor * cur = build_inp_embd(model.tok_embd);
cb(cur, "model.embed_tokens", -1);
cb(cur, "model.layers.{}.operator_norm", il);
cur = hparams.is_recurrent(il) ?
- build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) :
- build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ;
+ build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
+ build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ;
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
return cur;
}
- ggml_tensor * build_attn_block(ggml_cgraph * gf,
- ggml_tensor * cur,
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
int il) const {
ext_factor, attn_factor, beta_fast, beta_slow
);
- cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL,
+ cur = build_attn(inp_attn, model.layers[il].wo, NULL,
q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "model.layers.{}.self_attn.out_proj", il);
return cur;
}
- ggml_tensor * build_shortconv_block(ggml_cgraph * gf,
- ggml_tensor * cur,
+ ggml_tensor * build_shortconv_block(ggml_tensor * cur,
llm_graph_input_rs * inp_recr,
int il) {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
// read conv state
auto * conv_state = mctx_cur->get_r_l(il);
- auto * conv_rs = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
+ auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
bx = ggml_concat(ctx0, conv, bx, 0);
}
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
- // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
- auto * gf = params.res->get_gf();
-
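// [editor's note] With the removed TODO above resolved (the context now keeps
// its own `gf`), the factory constructs every builder from just the model and
// params; the graph is reached through the base-class member rather than
// `params.res->get_gf()` at this call site.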
std::unique_ptr<llm_graph_context> llm;
switch (arch) {
case LLM_ARCH_LLAMA:
{
- llm = std::make_unique<llm_build_llama>(*this, params, gf);
+ llm = std::make_unique<llm_build_llama>(*this, params);
} break;
case LLM_ARCH_LLAMA4:
{
- llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params);
} break;
case LLM_ARCH_DECI:
{
- llm = std::make_unique<llm_build_deci>(*this, params, gf);
+ llm = std::make_unique<llm_build_deci>(*this, params);
} break;
case LLM_ARCH_BAICHUAN:
{
- llm = std::make_unique<llm_build_baichuan>(*this, params, gf);
+ llm = std::make_unique<llm_build_baichuan>(*this, params);
} break;
case LLM_ARCH_FALCON:
{
- llm = std::make_unique<llm_build_falcon>(*this, params, gf);
+ llm = std::make_unique<llm_build_falcon>(*this, params);
} break;
case LLM_ARCH_GROK:
{
- llm = std::make_unique<llm_build_grok>(*this, params, gf);
+ llm = std::make_unique<llm_build_grok>(*this, params);
} break;
case LLM_ARCH_STARCODER:
{
- llm = std::make_unique<llm_build_starcoder>(*this, params, gf);
+ llm = std::make_unique<llm_build_starcoder>(*this, params);
} break;
case LLM_ARCH_REFACT:
{
- llm = std::make_unique<llm_build_refact>(*this, params, gf);
+ llm = std::make_unique<llm_build_refact>(*this, params);
} break;
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{
- llm = std::make_unique<llm_build_bert>(*this, params, gf);
+ llm = std::make_unique<llm_build_bert>(*this, params);
} break;
case LLM_ARCH_NEO_BERT:
{
- llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
+ llm = std::make_unique<llm_build_neo_bert>(*this, params);
} break;
case LLM_ARCH_BLOOM:
{
- llm = std::make_unique<llm_build_bloom>(*this, params, gf);
+ llm = std::make_unique<llm_build_bloom>(*this, params);
} break;
case LLM_ARCH_MPT:
{
- llm = std::make_unique<llm_build_mpt>(*this, params, gf);
+ llm = std::make_unique<llm_build_mpt>(*this, params);
} break;
case LLM_ARCH_STABLELM:
{
- llm = std::make_unique<llm_build_stablelm>(*this, params, gf);
+ llm = std::make_unique<llm_build_stablelm>(*this, params);
} break;
case LLM_ARCH_QWEN:
{
- llm = std::make_unique<llm_build_qwen>(*this, params, gf);
+ llm = std::make_unique<llm_build_qwen>(*this, params);
} break;
case LLM_ARCH_QWEN2:
{
- llm = std::make_unique<llm_build_qwen2>(*this, params, gf);
+ llm = std::make_unique<llm_build_qwen2>(*this, params);
} break;
case LLM_ARCH_DREAM:
{
- llm = std::make_unique<llm_build_dream>(*this, params, gf);
+ llm = std::make_unique<llm_build_dream>(*this, params);
}
break;
case LLM_ARCH_QWEN2VL:
{
- llm = std::make_unique<llm_build_qwen2vl>(*this, params, gf);
+ llm = std::make_unique<llm_build_qwen2vl>(*this, params);
} break;
case LLM_ARCH_QWEN2MOE:
{
- llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
+ llm = std::make_unique<llm_build_qwen2moe>(*this, params);
} break;
case LLM_ARCH_QWEN3:
{
- llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
+ llm = std::make_unique<llm_build_qwen3>(*this, params);
} break;
case LLM_ARCH_QWEN3MOE:
{
- llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params);
} break;
case LLM_ARCH_PHI2:
{
- llm = std::make_unique<llm_build_phi2>(*this, params, gf);
+ llm = std::make_unique<llm_build_phi2>(*this, params);
} break;
case LLM_ARCH_PHI3:
case LLM_ARCH_PHIMOE:
{
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
- llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+ llm = std::make_unique<llm_build_phi3<true>> (*this, params);
} else {
- llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+ llm = std::make_unique<llm_build_phi3<false>>(*this, params);
}
} break;
case LLM_ARCH_PLAMO:
{
- llm = std::make_unique<llm_build_plamo>(*this, params, gf);
+ llm = std::make_unique<llm_build_plamo>(*this, params);
} break;
case LLM_ARCH_PLAMO2:
{
- llm = std::make_unique<llm_build_plamo2>(*this, params, gf);
+ llm = std::make_unique<llm_build_plamo2>(*this, params);
} break;
case LLM_ARCH_GPT2:
{
- llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
+ llm = std::make_unique<llm_build_gpt2>(*this, params);
} break;
case LLM_ARCH_CODESHELL:
{
- llm = std::make_unique<llm_build_codeshell>(*this, params, gf);
+ llm = std::make_unique<llm_build_codeshell>(*this, params);
} break;
case LLM_ARCH_ORION:
{
- llm = std::make_unique<llm_build_orion>(*this, params, gf);
+ llm = std::make_unique<llm_build_orion>(*this, params);
} break;
case LLM_ARCH_INTERNLM2:
{
- llm = std::make_unique<llm_build_internlm2>(*this, params, gf);
+ llm = std::make_unique<llm_build_internlm2>(*this, params);
} break;
case LLM_ARCH_MINICPM3:
{
- llm = std::make_unique<llm_build_minicpm3>(*this, params, gf);
+ llm = std::make_unique<llm_build_minicpm3>(*this, params);
} break;
case LLM_ARCH_GEMMA:
{
- llm = std::make_unique<llm_build_gemma>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma>(*this, params);
} break;
case LLM_ARCH_GEMMA2:
{
- llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
} break;
case LLM_ARCH_GEMMA3:
{
- llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
} break;
case LLM_ARCH_GEMMA3N:
{
- llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
} break;
case LLM_ARCH_STARCODER2:
{
- llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
+ llm = std::make_unique<llm_build_starcoder2>(*this, params);
} break;
case LLM_ARCH_MAMBA:
case LLM_ARCH_MAMBA2:
{
- llm = std::make_unique<llm_build_mamba>(*this, params, gf);
+ llm = std::make_unique<llm_build_mamba>(*this, params);
} break;
case LLM_ARCH_JAMBA:
{
- llm = std::make_unique<llm_build_jamba>(*this, params, gf);
+ llm = std::make_unique<llm_build_jamba>(*this, params);
} break;
case LLM_ARCH_XVERSE:
{
- llm = std::make_unique<llm_build_xverse>(*this, params, gf);
+ llm = std::make_unique<llm_build_xverse>(*this, params);
} break;
case LLM_ARCH_COMMAND_R:
{
- llm = std::make_unique<llm_build_command_r>(*this, params, gf);
+ llm = std::make_unique<llm_build_command_r>(*this, params);
} break;
case LLM_ARCH_COHERE2:
{
- llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
} break;
case LLM_ARCH_DBRX:
{
- llm = std::make_unique<llm_build_dbrx>(*this, params, gf);
+ llm = std::make_unique<llm_build_dbrx>(*this, params);
} break;
case LLM_ARCH_OLMO:
{
- llm = std::make_unique<llm_build_olmo>(*this, params, gf);
+ llm = std::make_unique<llm_build_olmo>(*this, params);
} break;
case LLM_ARCH_OLMO2:
{
- llm = std::make_unique<llm_build_olmo2>(*this, params, gf);
+ llm = std::make_unique<llm_build_olmo2>(*this, params);
} break;
case LLM_ARCH_OLMOE:
{
- llm = std::make_unique<llm_build_olmoe>(*this, params, gf);
+ llm = std::make_unique<llm_build_olmoe>(*this, params);
} break;
case LLM_ARCH_OPENELM:
{
- llm = std::make_unique<llm_build_openelm>(*this, params, gf);
+ llm = std::make_unique<llm_build_openelm>(*this, params);
} break;
case LLM_ARCH_GPTNEOX:
{
- llm = std::make_unique<llm_build_gptneox>(*this, params, gf);
+ llm = std::make_unique<llm_build_gptneox>(*this, params);
} break;
case LLM_ARCH_ARCTIC:
{
- llm = std::make_unique<llm_build_arctic>(*this, params, gf);
+ llm = std::make_unique<llm_build_arctic>(*this, params);
} break;
case LLM_ARCH_DEEPSEEK:
{
- llm = std::make_unique<llm_build_deepseek>(*this, params, gf);
+ llm = std::make_unique<llm_build_deepseek>(*this, params);
} break;
case LLM_ARCH_DEEPSEEK2:
{
- llm = std::make_unique<llm_build_deepseek2>(*this, params, gf);
+ llm = std::make_unique<llm_build_deepseek2>(*this, params);
} break;
case LLM_ARCH_CHATGLM:
{
- llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
+ llm = std::make_unique<llm_build_chatglm>(*this, params);
} break;
case LLM_ARCH_GLM4:
{
- llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+ llm = std::make_unique<llm_build_glm4>(*this, params);
} break;
case LLM_ARCH_BITNET:
{
- llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
+ llm = std::make_unique<llm_build_bitnet>(*this, params);
} break;
case LLM_ARCH_T5:
{
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
- llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
+ llm = std::make_unique<llm_build_t5_enc>(*this, params);
break;
case LLM_GRAPH_TYPE_DEFAULT:
case LLM_GRAPH_TYPE_DECODER:
- llm = std::make_unique<llm_build_t5_dec>(*this, params, gf);
+ llm = std::make_unique<llm_build_t5_dec>(*this, params);
break;
default:
GGML_ABORT("invalid graph type");
};
} break;
case LLM_ARCH_T5ENCODER:
{
- llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
+ llm = std::make_unique<llm_build_t5_enc>(*this, params);
}
break;
case LLM_ARCH_JAIS:
{
- llm = std::make_unique<llm_build_jais>(*this, params, gf);
+ llm = std::make_unique<llm_build_jais>(*this, params);
} break;
case LLM_ARCH_NEMOTRON:
{
- llm = std::make_unique<llm_build_nemotron>(*this, params, gf);
+ llm = std::make_unique<llm_build_nemotron>(*this, params);
} break;
case LLM_ARCH_EXAONE:
{
- llm = std::make_unique<llm_build_exaone>(*this, params, gf);
+ llm = std::make_unique<llm_build_exaone>(*this, params);
} break;
case LLM_ARCH_RWKV6:
{
- llm = std::make_unique<llm_build_rwkv6>(*this, params, gf);
+ llm = std::make_unique<llm_build_rwkv6>(*this, params);
} break;
case LLM_ARCH_RWKV6QWEN2:
{
- llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params, gf);
+ llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
} break;
case LLM_ARCH_RWKV7:
{
- llm = std::make_unique<llm_build_rwkv7>(*this, params, gf);
+ llm = std::make_unique<llm_build_rwkv7>(*this, params);
} break;
case LLM_ARCH_ARWKV7:
{
- llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
+ llm = std::make_unique<llm_build_arwkv7>(*this, params);
} break;
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_MINICPM:
{
- llm = std::make_unique<llm_build_granite>(*this, params, gf);
+ llm = std::make_unique<llm_build_granite>(*this, params);
} break;
case LLM_ARCH_GRANITE_HYBRID:
{
- llm = std::make_unique<llm_build_granite_hybrid>(*this, params, gf);
+ llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
} break;
case LLM_ARCH_CHAMELEON:
{
- llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
+ llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
- llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
+ llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
} break;
case LLM_ARCH_PLM:
{
- llm = std::make_unique<llm_build_plm>(*this, params, gf);
+ llm = std::make_unique<llm_build_plm>(*this, params);
} break;
case LLM_ARCH_BAILINGMOE:
{
- llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params);
} break;
case LLM_ARCH_DOTS1:
{
- llm = std::make_unique<llm_build_dots1>(*this, params, gf);
+ llm = std::make_unique<llm_build_dots1>(*this, params);
} break;
case LLM_ARCH_ARCEE:
{
- llm = std::make_unique<llm_build_arcee>(*this, params, gf);
+ llm = std::make_unique<llm_build_arcee>(*this, params);
} break;
case LLM_ARCH_ERNIE4_5:
{
- llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
+ llm = std::make_unique<llm_build_ernie4_5>(*this, params);
} break;
case LLM_ARCH_ERNIE4_5_MOE:
{
- llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params, gf);
+ llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
} break;
case LLM_ARCH_HUNYUAN_MOE:
{
- llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
+ llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
} break;
case LLM_ARCH_SMOLLM3:
{
- llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+ llm = std::make_unique<llm_build_smollm3>(*this, params);
} break;
case LLM_ARCH_FALCON_H1:
{
- llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
+ llm = std::make_unique<llm_build_falcon_h1>(*this, params);
} break;
case LLM_ARCH_LFM2:
{
- llm = std::make_unique<llm_build_lfm2>(*this, params, gf);
+ llm = std::make_unique<llm_build_lfm2>(*this, params);
} break;
default:
GGML_ABORT("fatal error");
}
// add on pooling layer
- llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
+ llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
return llm->res->get_gf();
}
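The TODO removed at the top of build_graph is what this diff delivers: the graph stays inside the context, so a caller of `llama_model::build_graph` never touches `gf` until the builder hands it back. A usage sketch, assuming a `model` and fully populated `params` — the `make_params()` helper is hypothetical, standing in for real setup code:
// Usage sketch (illustrative):
llm_graph_params params = make_params();       // hypothetical setup helper
ggml_cgraph * gf = model.build_graph(params);  // gf originates from llm->res->get_gf()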