ggml_tensor * gpt_neox_ff(
const gpt_neox_block &block,
ggml_context * ctx0,
- ggml_tensor * inp) {
+ ggml_tensor * inp,
+ const gpt_neox_hparams &hparams) {
- ggml_tensor * cur = ggml_norm(ctx0, inp);
+ ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
    cur = ggml_add(ctx0,
            ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur),
            ggml_repeat(ctx0, block.ln_2_b, cur));
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
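    // For reference (illustrative note, not part of the patch): ggml_norm() normalizes each row
    // to zero mean and unit variance, with norm_eps guarding the division:
    //
    //     y = (x - mean(x)) / sqrt(var(x) + norm_eps)
    //
    // The ggml_mul/ggml_repeat lines above apply the learned scale (ln_2_g) and shift (ln_2_b),
    // completing the LayerNorm before the c_mlp_fc_w projection.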
// self-attention
{
{
- cur = ggml_norm(ctx0, inpL);
+ cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
cur = ggml_add(ctx0,
            ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
            ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur));
if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
- cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
+ cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
// input for next layer
inpL = ggml_add(ctx0, cur, inpFF);
    } else {
        struct ggml_tensor * inpFF = cur;

        // this is independent of the self-attention result, so it could be done in parallel with the self-attention
// note here we pass inpL instead of cur
- cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
+ cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
// layer input + FF
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        inpL = ggml_add(ctx0, cur, inpL);
    }
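    // Summary of the two residual layouts above (illustrative notation; the layer norms are
    // applied inside the attention block and inside gpt_neox_ff respectively):
    //   par_res == 0 (sequential): h = x + Attn(LN1(x));  x_next = h + FF(LN2(h))
    //   par_res != 0 (parallel)  : x_next = x + Attn(LN1(x)) + FF(LN2(x))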
// norm
{
- inpL = ggml_norm(ctx0, inpL);
+ inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
// inpL = ln_f_g*inpL + ln_f_b
        inpL = ggml_add(ctx0,
                ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_g, inpL), inpL),
                ggml_repeat(ctx0, model.ln_f_b, inpL));
    }
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GPT2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_GPTJ,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_GPTNEOX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_MPT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_UNKNOWN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
};
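// Illustrative sketch, not part of this change: each per-block pattern above contains a single
// "%d" placeholder for the block index, so a concrete tensor name can be formatted directly.
// The helper below is hypothetical and assumes the table above is the LLM_TENSOR_NAMES map
// (std::map<llm_arch, std::map<llm_tensor, std::string>>), with <cstdio> and <string> already
// included.
static std::string llm_tensor_name_example(llm_arch arch, llm_tensor tensor, int bid) {
    char buf[64];
    snprintf(buf, sizeof(buf), LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
    return buf; // e.g. LLM_ARCH_GPTNEOX, LLM_TENSOR_ATTN_QKV, bid = 2 -> "blk.2.attn_qkv"
}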
static llm_arch llm_arch_from_string(const std::string & name) {
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
- if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ }
}
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+ // gpt-j n_rot = rotary_dim
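+ //   e.g. (illustrative numbers) rotary_pct = 0.25, n_embd = 4096, n_head = 32:
+ //        n_rot = 0.25 * (4096 / 32) = 32, while n_embd / n_head = 128, so the strict
+ //        check above would reject it; hence the check is limited to LLAMA and FALCON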
}
// arch-specific KVs