self.gguf_writer.add_causal_attention(False)
self._try_set_pooling_type()
+ if cls_out_labels := self.hparams.get("id2label"):
+     key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
+     self.gguf_writer.add_array(key_name, [v for k, v in sorted(cls_out_labels.items())])
+
def set_vocab(self):
    tokens, toktypes, tokpre = self.get_vocab_base()
    self.vocab_size = len(tokens)
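For reference, a minimal standalone sketch of what the new block writes for a hypothetical two-label BertForSequenceClassification config (the label names and the "bert" arch string are assumptions for illustration, not taken from this change):

# Hypothetical config fragment from a sequence-classification checkpoint.
hparams = {"id2label": {"0": "NEGATIVE", "1": "POSITIVE"}}

# Same format string as the Keys.Classifier.OUTPUT_LABELS constant added below.
OUTPUT_LABELS = "{arch}.classifier.output_labels"

if cls_out_labels := hparams.get("id2label"):
    key_name = OUTPUT_LABELS.format(arch="bert")  # assumed arch name for MODEL_ARCH.BERT
    labels = [v for k, v in sorted(cls_out_labels.items())]
    print(key_name, labels)  # bert.classifier.output_labels ['NEGATIVE', 'POSITIVE']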
if name.startswith("cls.seq_relationship"):
    return []
- # For BertForSequenceClassification (direct projection layer)
- if name == "classifier.weight":
-     name = "classifier.out_proj.weight"
+ if self.hparams.get("id2label"):
+     # For BertForSequenceClassification (direct projection layer)
+     if name == "classifier.weight":
+         name = "classifier.out_proj.weight"

- if name == "classifier.bias":
-     name = "classifier.out_proj.bias"
+     if name == "classifier.bias":
+         name = "classifier.out_proj.bias"
return [(self.map_tensor_name(name), data_torch)]
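To illustrate the behavioural change above, a small sketch (a standalone helper, not part of the converter) of how the classifier tensor names are remapped depending on whether the config carries id2label:

def remap_classifier_name(name: str, has_id2label: bool) -> str:
    # Mirrors the hunk above: only sequence-classification checkpoints
    # (those with id2label) get their direct projection layer renamed to
    # the out_proj naming expected by the GGUF tensor map.
    if has_id2label:
        if name == "classifier.weight":
            return "classifier.out_proj.weight"
        if name == "classifier.bias":
            return "classifier.out_proj.bias"
    return name

assert remap_classifier_name("classifier.weight", True) == "classifier.out_proj.weight"
assert remap_classifier_name("classifier.weight", False) == "classifier.weight"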
self.gguf_writer.add_add_eos_token(True)
-@ModelBase.register("RobertaModel")
+@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
class RobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT
EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
BLOCK_COUNT = "{arch}.convnext.block_count"
+ class Classifier:
+     OUTPUT_LABELS = "{arch}.classifier.output_labels"
+
class Tokenizer:
    MODEL = "tokenizer.ggml.model"
    PRE = "tokenizer.ggml.pre"
{ LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
{ LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+ { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
LLM_KV_CONVNEXT_BLOCK_COUNT,
+ LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID,
bool attn_soft_cap = false;
bool use_kq_norm = true;
+ // for Classifiers
+ uint32_t n_cls_out = 1;
+
// llama4
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+ ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
switch (hparams.n_layer) {
case 3:
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
- cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
- cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
}
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
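Taken together, the loader derives n_cls_out from the length of the <arch>.classifier.output_labels array (falling back to the default of 1 when the key is absent, since the read is non-required) and sizes the classification head from it. A minimal sketch of that relationship, with a hypothetical hidden size and label set:

n_embd = 768                              # hypothetical hidden size
output_labels = ["NEGATIVE", "POSITIVE"]  # hypothetical <arch>.classifier.output_labels
n_cls_out = len(output_labels) if output_labels else 1  # mirrors ml.get_arr_n(..., false) with default 1

# Expected shapes of the optional classification head tensors:
cls_out_weight_shape = (n_embd, n_cls_out)  # LLM_TENSOR_CLS_OUT "weight"
cls_out_bias_shape = (n_cls_out,)           # LLM_TENSOR_CLS_OUT "bias"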