         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
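+        # (in HF transformers, RoBERTa position ids for real tokens start at padding_idx + 1,
+        # so the first pad_token_id + 1 rows of the position embedding only ever cover padding)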
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
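+            # RoBERTa tokenizers wrap inputs as <s> ... </s>, so both special tokens are expected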
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
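+        # ("roberta.embeddings.word_embeddings.weight" -> "embeddings.word_embeddings.weight")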
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
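+                # the sliced matrix has max_position_embeddings - (1 + pad_token_id) rows,
+                # matching the hparam adjusted in __init__ above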
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT