if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
- self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+ self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
if self.ftype == gguf.LlamaFileType.GUESSED:
raise ValueError(f"Unprocessed experts: {experts}")
+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+ model_arch = gguf.MODEL_ARCH.T5
+
+ def set_vocab(self):
+        # force the pure-Python protobuf backend to avoid the
+        # "TypeError: Descriptors cannot be created directly" exception
+        # raised when importing sentencepiece_model_pb2
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+ from sentencepiece import SentencePieceProcessor
+ from sentencepiece import sentencepiece_model_pb2 as model
+
+ tokenizer_path = self.dir_model / 'spiece.model'
+
+ if not tokenizer_path.is_file():
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+ sentencepiece_model = model.ModelProto()
+        with open(tokenizer_path, "rb") as f:
+            sentencepiece_model.ParseFromString(f.read())
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
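+        # pre-fill every slot with a placeholder; real pieces overwrite them below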
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
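+        # added_tokens.json maps token text to id; matching slots below are
+        # overwritten and marked USER_DEFINED with a deliberately low score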
+ added_tokens_file = self.dir_model / 'added_tokens.json'
+ if added_tokens_file.is_file():
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
+ added_tokens_json = json.load(f)
+ for key in added_tokens_json:
+ token_id = added_tokens_json[key]
+                if token_id >= vocab_size:
+                    logger.warning(f'ignoring token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue
+
+ tokens[token_id] = key.encode("utf-8")
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
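+        # defensive: tokens was pre-sized to vocab_size above, so this branch
+        # is not expected to trigger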
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+ for i in range(1, pad_count + 1):
+                tokens.append(f"[PAD{i}]".encode("utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+ self.gguf_writer.add_tokenizer_model("t5")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+ self.gguf_writer.add_add_space_prefix(add_prefix)
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+ if precompiled_charsmap:
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
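+        # T5 tokenization appends EOS ("</s>") and does not prepend BOS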
+ self.gguf_writer.add_add_bos_token(False)
+ self.gguf_writer.add_add_eos_token(True)
+
+ def set_gguf_parameters(self):
+ self.gguf_writer.add_name("T5")
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
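+        # T5's LayerNorm variant omits mean subtraction (effectively RMSNorm),
+        # so the same epsilon is stored under both the regular and RMS eps keys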
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+ self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+        # T5 and Flan-T5 based models sometimes contain an "encoder.embed_tokens.weight"
+        # or "decoder.embed_tokens.weight" tensor that duplicates the "shared.weight"
+        # tensor. Skip both so the unmapped duplicates do not abort the conversion;
+        # only "shared.weight" is used.
+        if name in ("decoder.embed_tokens.weight", "encoder.embed_tokens.weight"):
+            logger.debug(f"Skipping duplicate tensor {name!r}")
+            return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+
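+# Hypothetical usage sketch (entry-point name and flags are assumptions, not
+# part of this change): with the registrations above, a conversion run such as
+#   python convert-hf-to-gguf.py /path/to/t5-model
+# selects T5Model when config.json lists "T5ForConditionalGeneration" or
+# "T5WithLMHeadModel" in its "architectures" field.
+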
###### CONVERSION LOGIC ######
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
+ DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
CAUSAL = "{arch}.attention.causal"
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
+ REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
class Tokenizer:
- MODEL = "tokenizer.ggml.model"
- PRE = "tokenizer.ggml.pre"
- LIST = "tokenizer.ggml.tokens"
- TOKEN_TYPE = "tokenizer.ggml.token_type"
- TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
- SCORES = "tokenizer.ggml.scores"
- MERGES = "tokenizer.ggml.merges"
- BOS_ID = "tokenizer.ggml.bos_token_id"
- EOS_ID = "tokenizer.ggml.eos_token_id"
- UNK_ID = "tokenizer.ggml.unknown_token_id"
- SEP_ID = "tokenizer.ggml.seperator_token_id"
- PAD_ID = "tokenizer.ggml.padding_token_id"
- CLS_ID = "tokenizer.ggml.cls_token_id"
- MASK_ID = "tokenizer.ggml.mask_token_id"
- ADD_BOS = "tokenizer.ggml.add_bos_token"
- ADD_EOS = "tokenizer.ggml.add_eos_token"
- ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
- HF_JSON = "tokenizer.huggingface.json"
- RWKV = "tokenizer.rwkv.world"
- CHAT_TEMPLATE = "tokenizer.chat_template"
- CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
- CHAT_TEMPLATES = "tokenizer.chat_templates"
+ MODEL = "tokenizer.ggml.model"
+ PRE = "tokenizer.ggml.pre"
+ LIST = "tokenizer.ggml.tokens"
+ TOKEN_TYPE = "tokenizer.ggml.token_type"
+ TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
+ SCORES = "tokenizer.ggml.scores"
+ MERGES = "tokenizer.ggml.merges"
+ BOS_ID = "tokenizer.ggml.bos_token_id"
+ EOS_ID = "tokenizer.ggml.eos_token_id"
+ UNK_ID = "tokenizer.ggml.unknown_token_id"
+ SEP_ID = "tokenizer.ggml.seperator_token_id"
+ PAD_ID = "tokenizer.ggml.padding_token_id"
+ CLS_ID = "tokenizer.ggml.cls_token_id"
+ MASK_ID = "tokenizer.ggml.mask_token_id"
+ ADD_BOS = "tokenizer.ggml.add_bos_token"
+ ADD_EOS = "tokenizer.ggml.add_eos_token"
+ ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
+ REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
+ PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
+ HF_JSON = "tokenizer.huggingface.json"
+ RWKV = "tokenizer.rwkv.world"
+ CHAT_TEMPLATE = "tokenizer.chat_template"
+ CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
+ CHAT_TEMPLATES = "tokenizer.chat_templates"
# FIM/Infill special tokens constants
- PREFIX_ID = "tokenizer.ggml.prefix_token_id"
- SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
- MIDDLE_ID = "tokenizer.ggml.middle_token_id"
- EOT_ID = "tokenizer.ggml.eot_token_id"
+ PREFIX_ID = "tokenizer.ggml.prefix_token_id"
+ SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
+ MIDDLE_ID = "tokenizer.ggml.middle_token_id"
+ EOT_ID = "tokenizer.ggml.eot_token_id"
#
class MODEL_ARCH(IntEnum):
- LLAMA = auto()
- FALCON = auto()
- BAICHUAN = auto()
- GROK = auto()
- GPT2 = auto()
- GPTJ = auto()
- GPTNEOX = auto()
- MPT = auto()
- STARCODER = auto()
- REFACT = auto()
- BERT = auto()
- NOMIC_BERT = auto()
+ LLAMA = auto()
+ FALCON = auto()
+ BAICHUAN = auto()
+ GROK = auto()
+ GPT2 = auto()
+ GPTJ = auto()
+ GPTNEOX = auto()
+ MPT = auto()
+ STARCODER = auto()
+ REFACT = auto()
+ BERT = auto()
+ NOMIC_BERT = auto()
JINA_BERT_V2 = auto()
- BLOOM = auto()
- STABLELM = auto()
- QWEN = auto()
- QWEN2 = auto()
- QWEN2MOE = auto()
- PHI2 = auto()
- PHI3 = auto()
- PLAMO = auto()
- CODESHELL = auto()
- ORION = auto()
- INTERNLM2 = auto()
- MINICPM = auto()
- GEMMA = auto()
- STARCODER2 = auto()
- MAMBA = auto()
- XVERSE = auto()
- COMMAND_R = auto()
- DBRX = auto()
- OLMO = auto()
- ARCTIC = auto()
- DEEPSEEK2 = auto()
- BITNET = auto()
+ BLOOM = auto()
+ STABLELM = auto()
+ QWEN = auto()
+ QWEN2 = auto()
+ QWEN2MOE = auto()
+ PHI2 = auto()
+ PHI3 = auto()
+ PLAMO = auto()
+ CODESHELL = auto()
+ ORION = auto()
+ INTERNLM2 = auto()
+ MINICPM = auto()
+ GEMMA = auto()
+ STARCODER2 = auto()
+ MAMBA = auto()
+ XVERSE = auto()
+ COMMAND_R = auto()
+ DBRX = auto()
+ OLMO = auto()
+ ARCTIC = auto()
+ DEEPSEEK2 = auto()
+ BITNET = auto()
+ T5 = auto()
class MODEL_TENSOR(IntEnum):
- TOKEN_EMBD = auto()
- TOKEN_EMBD_NORM = auto()
- TOKEN_TYPES = auto()
- POS_EMBD = auto()
- OUTPUT = auto()
- OUTPUT_NORM = auto()
- ROPE_FREQS = auto()
- ROPE_FACTORS_LONG = auto()
- ROPE_FACTORS_SHORT = auto()
- ATTN_Q = auto()
- ATTN_K = auto()
- ATTN_V = auto()
- ATTN_QKV = auto()
- ATTN_OUT = auto()
- ATTN_NORM = auto()
- ATTN_NORM_2 = auto()
- ATTN_OUT_NORM = auto()
- ATTN_ROT_EMBD = auto()
- FFN_GATE_INP = auto()
- FFN_GATE_INP_SHEXP = auto()
- FFN_NORM = auto()
- FFN_GATE = auto()
- FFN_DOWN = auto()
- FFN_UP = auto()
- FFN_ACT = auto()
- FFN_NORM_EXP = auto()
- FFN_GATE_EXP = auto()
- FFN_DOWN_EXP = auto()
- FFN_UP_EXP = auto()
- FFN_GATE_SHEXP = auto()
- FFN_DOWN_SHEXP = auto()
- FFN_UP_SHEXP = auto()
- ATTN_Q_NORM = auto()
- ATTN_K_NORM = auto()
- LAYER_OUT_NORM = auto()
- SSM_IN = auto()
- SSM_CONV1D = auto()
- SSM_X = auto()
- SSM_DT = auto()
- SSM_A = auto()
- SSM_D = auto()
- SSM_OUT = auto()
- ATTN_Q_A = auto()
- ATTN_Q_B = auto()
- ATTN_KV_A_MQA = auto()
- ATTN_KV_B = auto()
- ATTN_Q_A_NORM = auto()
- ATTN_KV_A_NORM = auto()
- FFN_SUB_NORM = auto()
- ATTN_SUB_NORM = auto()
+ TOKEN_EMBD = auto()
+ TOKEN_EMBD_NORM = auto()
+ TOKEN_TYPES = auto()
+ POS_EMBD = auto()
+ OUTPUT = auto()
+ OUTPUT_NORM = auto()
+ ROPE_FREQS = auto()
+ ROPE_FACTORS_LONG = auto()
+ ROPE_FACTORS_SHORT = auto()
+ ATTN_Q = auto()
+ ATTN_K = auto()
+ ATTN_V = auto()
+ ATTN_QKV = auto()
+ ATTN_OUT = auto()
+ ATTN_NORM = auto()
+ ATTN_NORM_2 = auto()
+ ATTN_OUT_NORM = auto()
+ ATTN_ROT_EMBD = auto()
+ FFN_GATE_INP = auto()
+ FFN_GATE_INP_SHEXP = auto()
+ FFN_NORM = auto()
+ FFN_GATE = auto()
+ FFN_DOWN = auto()
+ FFN_UP = auto()
+ FFN_ACT = auto()
+ FFN_NORM_EXP = auto()
+ FFN_GATE_EXP = auto()
+ FFN_DOWN_EXP = auto()
+ FFN_UP_EXP = auto()
+ FFN_GATE_SHEXP = auto()
+ FFN_DOWN_SHEXP = auto()
+ FFN_UP_SHEXP = auto()
+ ATTN_Q_NORM = auto()
+ ATTN_K_NORM = auto()
+ LAYER_OUT_NORM = auto()
+ SSM_IN = auto()
+ SSM_CONV1D = auto()
+ SSM_X = auto()
+ SSM_DT = auto()
+ SSM_A = auto()
+ SSM_D = auto()
+ SSM_OUT = auto()
+ ATTN_Q_A = auto()
+ ATTN_Q_B = auto()
+ ATTN_KV_A_MQA = auto()
+ ATTN_KV_B = auto()
+ ATTN_Q_A_NORM = auto()
+ ATTN_KV_A_NORM = auto()
+ FFN_SUB_NORM = auto()
+ ATTN_SUB_NORM = auto()
+ DEC_ATTN_NORM = auto()
+ DEC_ATTN_Q = auto()
+ DEC_ATTN_K = auto()
+ DEC_ATTN_V = auto()
+ DEC_ATTN_OUT = auto()
+ DEC_ATTN_REL_B = auto()
+ DEC_CROSS_ATTN_NORM = auto()
+ DEC_CROSS_ATTN_Q = auto()
+ DEC_CROSS_ATTN_K = auto()
+ DEC_CROSS_ATTN_V = auto()
+ DEC_CROSS_ATTN_OUT = auto()
+ DEC_CROSS_ATTN_REL_B = auto()
+ DEC_FFN_NORM = auto()
+ DEC_FFN_GATE = auto()
+ DEC_FFN_DOWN = auto()
+ DEC_FFN_UP = auto()
+ DEC_OUTPUT_NORM = auto()
+ ENC_ATTN_NORM = auto()
+ ENC_ATTN_Q = auto()
+ ENC_ATTN_K = auto()
+ ENC_ATTN_V = auto()
+ ENC_ATTN_OUT = auto()
+ ENC_ATTN_REL_B = auto()
+ ENC_FFN_NORM = auto()
+ ENC_FFN_GATE = auto()
+ ENC_FFN_DOWN = auto()
+ ENC_FFN_UP = auto()
+ ENC_OUTPUT_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.BITNET: "bitnet",
+ MODEL_ARCH.T5: "t5",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
- MODEL_TENSOR.TOKEN_EMBD: "token_embd",
- MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
- MODEL_TENSOR.TOKEN_TYPES: "token_types",
- MODEL_TENSOR.POS_EMBD: "position_embd",
- MODEL_TENSOR.OUTPUT_NORM: "output_norm",
- MODEL_TENSOR.OUTPUT: "output",
- MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
- MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
- MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
- MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
- MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
- MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
- MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
- MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
- MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
- MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
- MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
- MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
- MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
- MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
- MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
- MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
- MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
- MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
- MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
- MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
- MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
- MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
- MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
- MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
- MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
- MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
- MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
- MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
- MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
- MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
- MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
- MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
- MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
- MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
- MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
- MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
- MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
- MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
- MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
- MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
- MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
- MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
- MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
- MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
+ MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+ MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
+ MODEL_TENSOR.TOKEN_TYPES: "token_types",
+ MODEL_TENSOR.POS_EMBD: "position_embd",
+ MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+ MODEL_TENSOR.OUTPUT: "output",
+ MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
+ MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
+ MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
+ MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
+ MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+ MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
+ MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
+ MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
+ MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
+ MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
+ MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+ MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
+ MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+ MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
+ MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
+ MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
+ MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+ MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
+ MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
+ MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+ MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
+ MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
+ MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
+ MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
+ MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
+ MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
+ MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
+ MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
+ MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+ MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
+ MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
+ MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
+ MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
+ MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
+ MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
+ MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+ MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
+ MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
+ MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
+ MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
+ MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
+ MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+ MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
+ MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
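+    # T5 (encoder-decoder): "dec." / "enc." prefixes keep the decoder and encoder
+    # stacks apart; cross-attention tensors exist only in the decoder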
+ MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
+ MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
+ MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
+ MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
+ MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
+ MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
+ MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
+ MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
+ MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
+ MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
+ MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
+ MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
+ MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
+ MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
+ MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
+ MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
+ MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
+ MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
+ MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
+ MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
+ MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
+ MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
+ MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
+ MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
+ MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
+ MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
+ MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
+ MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_SUB_NORM,
MODEL_TENSOR.FFN_SUB_NORM,
],
+ MODEL_ARCH.T5: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.DEC_ATTN_NORM,
+ MODEL_TENSOR.DEC_ATTN_Q,
+ MODEL_TENSOR.DEC_ATTN_K,
+ MODEL_TENSOR.DEC_ATTN_V,
+ MODEL_TENSOR.DEC_ATTN_OUT,
+ MODEL_TENSOR.DEC_ATTN_REL_B,
+ MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
+ MODEL_TENSOR.DEC_CROSS_ATTN_Q,
+ MODEL_TENSOR.DEC_CROSS_ATTN_K,
+ MODEL_TENSOR.DEC_CROSS_ATTN_V,
+ MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
+ MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
+ MODEL_TENSOR.DEC_FFN_NORM,
+ MODEL_TENSOR.DEC_FFN_GATE,
+ MODEL_TENSOR.DEC_FFN_DOWN,
+ MODEL_TENSOR.DEC_FFN_UP,
+ MODEL_TENSOR.DEC_OUTPUT_NORM,
+ MODEL_TENSOR.ENC_ATTN_NORM,
+ MODEL_TENSOR.ENC_ATTN_Q,
+ MODEL_TENSOR.ENC_ATTN_K,
+ MODEL_TENSOR.ENC_ATTN_V,
+ MODEL_TENSOR.ENC_ATTN_OUT,
+ MODEL_TENSOR.ENC_ATTN_REL_B,
+ MODEL_TENSOR.ENC_FFN_NORM,
+ MODEL_TENSOR.ENC_FFN_GATE,
+ MODEL_TENSOR.ENC_FFN_DOWN,
+ MODEL_TENSOR.ENC_FFN_UP,
+ MODEL_TENSOR.ENC_OUTPUT_NORM,
+ ],
# TODO
}
"backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
+ "shared", # t5
),
# Token type embeddings
MODEL_TENSOR.FFN_SUB_NORM: (
"model.layers.{bid}.mlp.ffn_layernorm", # bitnet
),
+
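+    # HF T5 checkpoints nest sublayers as {encoder,decoder}.block.{bid}.layer.N:
+    # N=0 is self-attention; the decoder uses N=1 for cross-attention and N=2 for
+    # the feed-forward block, while the encoder uses N=1 for the feed-forward block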
+ MODEL_TENSOR.DEC_ATTN_NORM: (
+ "decoder.block.{bid}.layer.0.layer_norm", # t5
+ ),
+
+ MODEL_TENSOR.DEC_ATTN_Q: (
+ "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+ ),
+
+ MODEL_TENSOR.DEC_ATTN_K: (
+ "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+ ),
+
+ MODEL_TENSOR.DEC_ATTN_V: (
+ "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+ ),
+
+ MODEL_TENSOR.DEC_ATTN_OUT: (
+ "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+ ),
+
+ MODEL_TENSOR.DEC_ATTN_REL_B: (
+ "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+ ),
+
+ MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+ "decoder.block.{bid}.layer.1.layer_norm", # t5
+ ),
+
+ MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+ "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+ ),
+
+ MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+ "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+ ),
+
+ MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+ "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+ ),
+
+ MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+ "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+ ),
+
+ MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+ "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+ ),
+
+ MODEL_TENSOR.DEC_FFN_NORM: (
+ "decoder.block.{bid}.layer.2.layer_norm", # t5
+ ),
+
+ MODEL_TENSOR.DEC_FFN_GATE: (
+ "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+ ),
+
+ MODEL_TENSOR.DEC_FFN_UP: (
+ "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
+ "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+ ),
+
+ MODEL_TENSOR.DEC_FFN_DOWN: (
+ "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+ ),
+
+ MODEL_TENSOR.DEC_OUTPUT_NORM: (
+ "decoder.final_layer_norm", # t5
+ ),
+
+ MODEL_TENSOR.ENC_ATTN_NORM: (
+ "encoder.block.{bid}.layer.0.layer_norm", # t5
+ ),
+
+ MODEL_TENSOR.ENC_ATTN_Q: (
+ "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+ ),
+
+ MODEL_TENSOR.ENC_ATTN_K: (
+ "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+ ),
+
+ MODEL_TENSOR.ENC_ATTN_V: (
+ "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+ ),
+
+ MODEL_TENSOR.ENC_ATTN_OUT: (
+ "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+ ),
+
+ MODEL_TENSOR.ENC_ATTN_REL_B: (
+ "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+ ),
+
+ MODEL_TENSOR.ENC_FFN_NORM: (
+ "encoder.block.{bid}.layer.1.layer_norm", # t5
+ ),
+
+ MODEL_TENSOR.ENC_FFN_GATE: (
+ "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+ ),
+
+ MODEL_TENSOR.ENC_FFN_UP: (
+ "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
+ "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+ ),
+
+ MODEL_TENSOR.ENC_FFN_DOWN: (
+ "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+ ),
+
+ MODEL_TENSOR.ENC_OUTPUT_NORM: (
+ "encoder.final_layer_norm", # t5
+ ),
}
# architecture-specific block mappings