    GPTNEOX   : int = auto()
    MPT       : int = auto()
    STARCODER : int = auto()
+   BERT      : int = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD  : int = auto()
+   TOKEN_TYPES : int = auto()
    POS_EMBD    : int = auto()
    OUTPUT      : int = auto()
    OUTPUT_NORM : int = auto()
    MODEL_ARCH.GPTNEOX:   "gptneox",
    MODEL_ARCH.MPT:       "mpt",
    MODEL_ARCH.STARCODER: "starcoder",
+   MODEL_ARCH.BERT:      "bert",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+   MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD:    "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT:      "output",
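
For orientation: MODEL_ARCH_NAMES supplies the string written to the general.architecture metadata key, while TENSOR_NAMES gives the base name each tensor gets inside the GGUF file. A minimal sketch of both lookups, assuming the tables above are in scope (the exact import path varies across gguf-py versions):

# Sketch only: the two lookups a converter script performs with these tables.
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSOR, TENSOR_NAMES

assert MODEL_ARCH_NAMES[MODEL_ARCH.BERT] == "bert"              # -> general.architecture
assert TENSOR_NAMES[MODEL_TENSOR.TOKEN_TYPES] == "token_types"  # -> on-disk tensor name
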
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+   MODEL_ARCH.BERT: [
+       MODEL_TENSOR.TOKEN_EMBD,
+       MODEL_TENSOR.TOKEN_TYPES,
+       MODEL_TENSOR.POS_EMBD,
+       MODEL_TENSOR.OUTPUT_NORM,
+       MODEL_TENSOR.ATTN_NORM,
+       MODEL_TENSOR.ATTN_Q,
+       MODEL_TENSOR.ATTN_K,
+       MODEL_TENSOR.ATTN_V,
+       MODEL_TENSOR.ATTN_OUT,
+       MODEL_TENSOR.FFN_NORM,
+       MODEL_TENSOR.FFN_DOWN,
+       MODEL_TENSOR.FFN_UP,
+   ],
+   MODEL_ARCH.MPT: [
+       MODEL_TENSOR.TOKEN_EMBD,
+       MODEL_TENSOR.OUTPUT_NORM,
+       MODEL_TENSOR.OUTPUT,
+       MODEL_TENSOR.ATTN_NORM,
+       MODEL_TENSOR.ATTN_QKV,
+       MODEL_TENSOR.ATTN_OUT,
+       MODEL_TENSOR.FFN_NORM,
+       MODEL_TENSOR.FFN_DOWN,
+       MODEL_TENSOR.FFN_UP,
+   ],
+   MODEL_ARCH.GPTJ: [
+       MODEL_TENSOR.TOKEN_EMBD,
+       MODEL_TENSOR.OUTPUT_NORM,
+       MODEL_TENSOR.OUTPUT,
+       MODEL_TENSOR.ATTN_NORM,
+       MODEL_TENSOR.ATTN_Q,
+       MODEL_TENSOR.ATTN_K,
+       MODEL_TENSOR.ATTN_V,
+       MODEL_TENSOR.ATTN_OUT,
+       MODEL_TENSOR.FFN_DOWN,
+       MODEL_TENSOR.FFN_UP,
+   ],
    MODEL_ARCH.GPT2: [
        # TODO
    ],
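
Each list above declares the set of tensors a model of that architecture may carry; note that the new BERT entry includes TOKEN_TYPES and POS_EMBD but no OUTPUT, since an encoder-only BERT checkpoint has no LM head. A hedged sketch of how the enclosing table (named MODEL_TENSORS in gguf-py) can drive a sanity check during conversion; the helper itself is illustrative, not part of the diff:

# Illustrative helper: refuse to write tensors that the target architecture
# does not declare in MODEL_TENSORS.
def check_declared(arch: MODEL_ARCH, tensors: list[MODEL_TENSOR]) -> None:
    declared = set(MODEL_TENSORS[arch])
    extra = [t for t in tensors if t not in declared]
    if extra:
        names = ", ".join(t.name for t in extra)
        raise ValueError(f"tensors not declared for {MODEL_ARCH_NAMES[arch]}: {names}")

check_declared(MODEL_ARCH.BERT, [MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_TYPES])
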
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
    # Token embeddings
    MODEL_TENSOR.TOKEN_EMBD: (
-       "gpt_neox.embed_in",           # gptneox
-       "transformer.wte",             # gpt2 mpt
-       "transformer.word_embeddings", # falcon
-       "model.embed_tokens",          # llama-hf
-       "tok_embeddings",              # llama-pth
+       "gpt_neox.embed_in",           # gptneox
+       "transformer.wte",             # gpt2 gpt-j mpt
+       "transformer.word_embeddings", # falcon
+       "model.embed_tokens",          # llama-hf
+       "tok_embeddings",              # llama-pth
+       "embeddings.word_embeddings",  # bert
+   ),
+
+   # Token type embeddings
+   MODEL_TENSOR.TOKEN_TYPES: (
+       "embeddings.token_type_embeddings", # bert
    ),
    # Position embeddings
    MODEL_TENSOR.POS_EMBD: (
-       "transformer.wpe", # gpt2
+       "transformer.wpe",                # gpt2
+       "embeddings.position_embeddings", # bert
    ),

    # Output
    MODEL_TENSOR.OUTPUT: (
-       "embed_out", # gptneox
-       "lm_head",   # gpt2 mpt falcon llama-hf baichuan
-       "output",    # llama-pth
+       "embed_out", # gptneox
+       "lm_head",   # gpt2 gpt-j mpt falcon llama-hf baichuan
+       "output",    # llama-pth
    ),

    # Output norm
    MODEL_TENSOR.OUTPUT_NORM: (
-       "gpt_neox.final_layer_norm", # gptneox
-       "transformer.ln_f",          # gpt2 falcon
-       "model.norm",                # llama-hf baichuan
-       "norm",                      # llama-pth
+       "gpt_neox.final_layer_norm", # gptneox
+       "transformer.ln_f",          # gpt2 gpt-j falcon
+       "model.norm",                # llama-hf baichuan
+       "norm",                      # llama-pth
+       "embeddings.LayerNorm",      # bert
+       "transformer.norm_f",        # mpt
    ),
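
Everything in mappings_cfg is a model-wide tensor: each entry maps the possible source-checkpoint names onto one MODEL_TENSOR. A minimal sketch of the lookup direction, assuming the dict above is in scope; the real TensorNameMap builds a reverse dictionary up front instead of scanning:

# Sketch only: map a source tensor name back to its MODEL_TENSOR, if any.
def find_tensor(name: str) -> MODEL_TENSOR | None:
    for tensor, candidates in mappings_cfg.items():
        if name in candidates:
            return tensor
    return None

assert find_tensor("embeddings.word_embeddings") == MODEL_TENSOR.TOKEN_EMBD         # bert
assert find_tensor("embeddings.token_type_embeddings") == MODEL_TENSOR.TOKEN_TYPES  # bert
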
    # Rope frequencies

block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
    # Attention norm
    MODEL_TENSOR.ATTN_NORM: (
-       "gpt_neox.layers.{bid}.input_layernorm", # gptneox
-       "transformer.h.{bid}.ln_1",              # gpt2
-       "transformer.blocks.{bid}.norm_1",       # mpt
-       "transformer.h.{bid}.input_layernorm",   # falcon7b
-       "transformer.h.{bid}.ln_mlp",            # falcon40b
-       "model.layers.{bid}.input_layernorm",    # llama-hf
-       "layers.{bid}.attention_norm",           # llama-pth
+       "gpt_neox.layers.{bid}.input_layernorm",          # gptneox
+       "transformer.h.{bid}.ln_1",                       # gpt2 gpt-j
+       "transformer.blocks.{bid}.norm_1",                # mpt
+       "transformer.h.{bid}.input_layernorm",            # falcon7b
+       "transformer.h.{bid}.ln_mlp",                     # falcon40b
+       "model.layers.{bid}.input_layernorm",             # llama-hf
+       "layers.{bid}.attention_norm",                    # llama-pth
+       "encoder.layer.{bid}.attention.output.LayerNorm", # bert
    ),
    # Attention norm 2

    # Attention query-key-value
    MODEL_TENSOR.ATTN_QKV: (
-       "gpt_neox.layers.{bid}.attention.query_key_value",    # gptneox
-       "transformer.h.{bid}.attn.c_attn",                    # gpt2
-       "transformer.blocks.{bid}.attn.Wqkv",                 # mpt
-       "transformer.h.{bid}.self_attention.query_key_value", # falcon
+       "gpt_neox.layers.{bid}.attention.query_key_value",    # gptneox
+       "transformer.h.{bid}.attn.c_attn",                    # gpt2
+       "transformer.blocks.{bid}.attn.Wqkv",                 # mpt
+       "transformer.h.{bid}.self_attention.query_key_value", # falcon
    ),
    # Attention query
    MODEL_TENSOR.ATTN_Q: (
-       "model.layers.{bid}.self_attn.q_proj", # llama-hf
-       "layers.{bid}.attention.wq",           # llama-pth
+       "model.layers.{bid}.self_attn.q_proj",      # llama-hf
+       "layers.{bid}.attention.wq",                # llama-pth
+       "encoder.layer.{bid}.attention.self.query", # bert
+       "transformer.h.{bid}.attn.q_proj",          # gpt-j
    ),

    # Attention key
    MODEL_TENSOR.ATTN_K: (
-       "model.layers.{bid}.self_attn.k_proj", # llama-hf
-       "layers.{bid}.attention.wk",           # llama-pth
+       "model.layers.{bid}.self_attn.k_proj",    # llama-hf
+       "layers.{bid}.attention.wk",              # llama-pth
+       "encoder.layer.{bid}.attention.self.key", # bert
+       "transformer.h.{bid}.attn.k_proj",        # gpt-j
    ),

    # Attention value
    MODEL_TENSOR.ATTN_V: (
-       "model.layers.{bid}.self_attn.v_proj", # llama-hf
-       "layers.{bid}.attention.wv",           # llama-pth
+       "model.layers.{bid}.self_attn.v_proj",      # llama-hf
+       "layers.{bid}.attention.wv",                # llama-pth
+       "encoder.layer.{bid}.attention.self.value", # bert
+       "transformer.h.{bid}.attn.v_proj",          # gpt-j
    ),

    # Attention output
    MODEL_TENSOR.ATTN_OUT: (
-       "gpt_neox.layers.{bid}.attention.dense",    # gptneox
-       "transformer.h.{bid}.attn.c_proj",          # gpt2
-       "transformer.blocks.{bid}.attn.out_proj",   # mpt
-       "transformer.h.{bid}.self_attention.dense", # falcon
-       "model.layers.{bid}.self_attn.o_proj",      # llama-hf
-       "layers.{bid}.attention.wo",                # llama-pth
+       "gpt_neox.layers.{bid}.attention.dense",      # gptneox
+       "transformer.h.{bid}.attn.c_proj",            # gpt2
+       "transformer.blocks.{bid}.attn.out_proj",     # mpt
+       "transformer.h.{bid}.self_attention.dense",   # falcon
+       "model.layers.{bid}.self_attn.o_proj",        # llama-hf
+       "layers.{bid}.attention.wo",                  # llama-pth
+       "encoder.layer.{bid}.attention.output.dense", # bert
+       "transformer.h.{bid}.attn.out_proj",          # gpt-j
    ),
    # Rotary embeddings

    # Feed-forward norm
    MODEL_TENSOR.FFN_NORM: (
-       "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
-       "transformer.h.{bid}.ln_2",                       # gpt2
-       "transformer.blocks.{bid}.norm_2",                # mpt
-       "model.layers.{bid}.post_attention_layernorm",    # llama-hf
-       "layers.{bid}.ffn_norm",                          # llama-pth
+       "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
+       "transformer.h.{bid}.ln_2",                       # gpt2
+       "transformer.blocks.{bid}.norm_2",                # mpt
+       "model.layers.{bid}.post_attention_layernorm",    # llama-hf
+       "layers.{bid}.ffn_norm",                          # llama-pth
+       "encoder.layer.{bid}.output.LayerNorm",           # bert
    ),

    # Feed-forward up
    MODEL_TENSOR.FFN_UP: (
-       "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
-       "transformer.h.{bid}.mlp.c_fc",            # gpt2
-       "transformer.blocks.{bid}.ffn.up_proj",    # mpt
-       "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
-       "model.layers.{bid}.mlp.up_proj",          # llama-hf
-       "layers.{bid}.feed_forward.w3",            # llama-pth
+       "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
+       "transformer.h.{bid}.mlp.c_fc",            # gpt2
+       "transformer.blocks.{bid}.ffn.up_proj",    # mpt
+       "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
+       "model.layers.{bid}.mlp.up_proj",          # llama-hf
+       "layers.{bid}.feed_forward.w3",            # llama-pth
+       "encoder.layer.{bid}.intermediate.dense",  # bert
+       "transformer.h.{bid}.mlp.fc_in",           # gpt-j
    ),
    # Feed-forward gate

    # Feed-forward down
    MODEL_TENSOR.FFN_DOWN: (
-       "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
-       "transformer.h.{bid}.mlp.c_proj",          # gpt2
-       "transformer.blocks.{bid}.ffn.down_proj",  # mpt
-       "transformer.h.{bid}.mlp.dense_4h_to_h",   # falcon
-       "model.layers.{bid}.mlp.down_proj",        # llama-hf
-       "layers.{bid}.feed_forward.w2",            # llama-pth
+       "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
+       "transformer.h.{bid}.mlp.c_proj",          # gpt2
+       "transformer.blocks.{bid}.ffn.down_proj",  # mpt
+       "transformer.h.{bid}.mlp.dense_4h_to_h",   # falcon
+       "model.layers.{bid}.mlp.down_proj",        # llama-hf
+       "layers.{bid}.feed_forward.w2",            # llama-pth
+       "encoder.layer.{bid}.output.dense",        # bert
+       "transformer.h.{bid}.mlp.fc_out",          # gpt-j
    ),
}
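
The {bid} placeholder stands for the block (layer) index; a converter substitutes each layer number before comparing names. A sketch of that resolution under two assumptions: the dict above is in scope, and per-block GGUF names take the usual "blk.{bid}." prefix (the block entries of TENSOR_NAMES are not shown in this excerpt):

# Sketch only: resolve a per-layer source name by trying every template with
# every block id. O(tensors * templates * blocks), fine for a one-off script.
def find_block_tensor(name: str, n_blocks: int) -> tuple[MODEL_TENSOR, int] | None:
    for tensor, templates in block_mappings_cfg.items():
        for template in templates:
            for bid in range(n_blocks):
                if template.format(bid=bid) == name:
                    return tensor, bid
    return None

hit = find_block_tensor("encoder.layer.3.attention.self.query", n_blocks=12)
assert hit == (MODEL_TENSOR.ATTN_Q, 3)  # would be stored as "blk.3.attn_q"
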