llama : add support for DistilBert (#13907)
author Đinh Trọng Huy <redacted>
Fri, 30 May 2025 09:56:02 +0000 (18:56 +0900)
committer GitHub <redacted>
Fri, 30 May 2025 09:56:02 +0000 (11:56 +0200)
* add distilbert

* small fixes

* add note for LLM_ARCH_DISTIL_BERT

* Use MODEL_ARCH.BERT for DistilBert

---------

Co-authored-by: dinhhuy <redacted>
convert_hf_to_gguf.py
gguf-py/gguf/tensor_mapping.py
src/llama-model.cpp

index 868bb6826b904682abe60d40f402122e710525aa..5b4ed0d5d3f8feef9c62465fb9011193a49dbbc3 100755 (executable)
@@ -523,15 +523,15 @@ class TextModel(ModelBase):
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")
 
-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
             self.gguf_writer.add_embedding_length(n_embd)
             logger.info(f"gguf: embedding length = {n_embd}")
 
-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")
 
-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
             self.gguf_writer.add_head_count(n_head)
             logger.info(f"gguf: head count = {n_head}")
 
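The widened key lists above are what allow the shared TextModel metadata code to read DistilBERT's config.json, which names the embedding width `dim`, the FFN width `hidden_dim`, and the head count `n_heads` instead of the BERT-style keys. A minimal sketch of the first-match fallback, with an illustrative helper and typical distilbert-base values (not the converter's actual code):

def pick_hparam(hparams: dict, keys: list[str], optional: bool = False):
    # Return the value for the first key that is present, mirroring find_hparam() above.
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"none of {keys} found in hparams")

# Typical distilbert-base config.json values (shown for illustration only).
hparams = {"dim": 768, "hidden_dim": 3072, "n_heads": 12, "n_layers": 6}
assert pick_hparam(hparams, ["hidden_size", "n_embd", "dim"]) == 768
assert pick_hparam(hparams, ["intermediate_size", "n_inner", "hidden_dim"]) == 3072
assert pick_hparam(hparams, ["num_attention_heads", "n_head", "n_heads"]) == 12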
@@ -3907,6 +3907,26 @@ class BertModel(TextModel):
         self.gguf_writer.add_add_eos_token(True)
 
 
+@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
+class DistilBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_layer_norm_eps(1e-12)
+        logger.info("gguf: layer norm epsilon = 1e-12")
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("distilbert."):
+            name = name[11:]
+
+        # These layers act as MLM head, so we don't need them
+        if name.startswith("vocab_"):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
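The new subclass reuses MODEL_ARCH.BERT wholesale; its only extra work is name hygiene. Tensors exported from a task-specific wrapper carry a `distilbert.` prefix, and the masked-LM head lives in top-level `vocab_*` tensors that an embedding model does not need. A small sketch of what `modify_tensors` does to a few example state-dict names (the names are taken from a typical DistilBertForMaskedLM checkpoint and are shown for illustration):

def rename(name: str) -> str | None:
    # Strip the "distilbert." wrapper prefix, drop the MLM head, keep everything else.
    if name.startswith("distilbert."):
        name = name[len("distilbert."):]
    if name.startswith("vocab_"):
        return None
    return name

for n in ["distilbert.embeddings.word_embeddings.weight",
          "distilbert.transformer.layer.0.attention.q_lin.weight",
          "vocab_projector.bias"]:
    print(n, "->", rename(n))
# prints: embeddings.word_embeddings.weight, transformer.layer.0.attention.q_lin.weight,
# and None for the dropped MLM-head tensor.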
 @ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 class RobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
index d0dad7036bc468c4f8f1b21a40de5c4c79345032..93dd1d8028f3dc2fca53165a8d3ef73b1135c837 100644 (file)
@@ -169,6 +169,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_proj_no_perm",               # llama-custom
             "layers.{bid}.attention.wq",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",                  # bert
+            "transformer.layer.{bid}.attention.q_lin",                   # distillbert
             "transformer.h.{bid}.attn.q_proj",                           # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj",                # plamo
             "model.layers.{bid}.attention.wq",                           # internlm2
@@ -183,6 +184,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_proj_no_perm",             # llama-custom
             "layers.{bid}.attention.wk",                               # llama-pth
             "encoder.layer.{bid}.attention.self.key",                  # bert
+            "transformer.layer.{bid}.attention.k_lin",                 # distillbert
             "transformer.h.{bid}.attn.k_proj",                         # gpt-j
             "transformer.h.{bid}.attn.k",                              # refact
             "model.layers.layers.{bid}.self_attn.k_proj",              # plamo
@@ -197,6 +199,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
             "layers.{bid}.attention.wv",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",                  # bert
+            "transformer.layer.{bid}.attention.v_lin",                   # distillbert
             "transformer.h.{bid}.attn.v_proj",                           # gpt-j
             "transformer.h.{bid}.attn.v",                                # refact
             "model.layers.layers.{bid}.self_attn.v_proj",                # plamo
@@ -217,6 +220,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.linear_attn",                     # deci
             "layers.{bid}.attention.wo",                                    # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                   # bert
+            "transformer.layer.{bid}.attention.out_lin",                    # distillbert
             "transformer.h.{bid}.attn.out_proj",                            # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",     # persimmon
             "model.layers.{bid}.self_attn.dense",                           # persimmon
@@ -237,6 +241,7 @@ class TensorNameMap:
         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
+            "transformer.layer.{bid}.sa_layer_norm",           # distillbert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
@@ -313,6 +318,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                           # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                 # bert
+            "transformer.layer.{bid}.ffn.lin1",                       # distillbert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
             "transformer.h.{bid}.mlp.linear_3",                       # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
@@ -396,6 +402,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
+            "transformer.layer.{bid}.ffn.lin2",                       # distillbert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
             "model.layers.{bid}.mlp.dense_4h_to_h",                   # persimmon
@@ -457,6 +464,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm",         # bert
+            "transformer.layer.{bid}.output_layer_norm",    # distillbert
             "encoder.layers.{bid}.norm2",                   # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
             "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
@@ -827,6 +835,7 @@ class TensorNameMap:
         MODEL_TENSOR.CLS: (
             "classifier",       # jina
             "classifier.dense", # roberta
+            "pre_classifier",   # distillbert
         ),
 
         MODEL_TENSOR.CLS_OUT: (
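All of the tensor_mapping.py additions follow one pattern: DistilBERT flattens what BERT nests (`attention.q_lin`/`k_lin`/`v_lin`/`out_lin` instead of `attention.self.*` and `attention.output.dense`, `ffn.lin1`/`ffn.lin2` instead of `intermediate.dense`/`output.dense`, plus `sa_layer_norm`, `output_layer_norm` and the `pre_classifier` head), so each HF name is simply registered against the GGUF tensor the table already uses for BERT. A sketch of how a concrete layer name resolves once `{bid}` is filled in (the GGUF-side names are written out here for illustration):

# Per-block mappings added above; GGUF names shown are illustrative.
DISTILBERT_TO_GGUF = {
    "transformer.layer.{bid}.attention.q_lin":   "blk.{bid}.attn_q",
    "transformer.layer.{bid}.attention.k_lin":   "blk.{bid}.attn_k",
    "transformer.layer.{bid}.attention.v_lin":   "blk.{bid}.attn_v",
    "transformer.layer.{bid}.attention.out_lin": "blk.{bid}.attn_output",
    "transformer.layer.{bid}.sa_layer_norm":     "blk.{bid}.attn_output_norm",
    "transformer.layer.{bid}.ffn.lin1":          "blk.{bid}.ffn_up",
    "transformer.layer.{bid}.ffn.lin2":          "blk.{bid}.ffn_down",
    "transformer.layer.{bid}.output_layer_norm": "blk.{bid}.layer_output_norm",
}

def map_tensor(hf_name: str, bid: int) -> str | None:
    # Fill the {bid} placeholder for this block id, then look the HF name up.
    expanded = {k.format(bid=bid): v.format(bid=bid) for k, v in DISTILBERT_TO_GGUF.items()}
    return expanded.get(hf_name)

print(map_tensor("transformer.layer.3.attention.q_lin", 3))  # -> blk.3.attn_q
print(map_tensor("transformer.layer.3.ffn.lin2", 3))         # -> blk.3.ffn_down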
index a1aa51412f8769abcfb83737a7105d8f07490fb1..3f1f6c9bf3b067536efe8ab7a04594aa543a4c0e 100644 (file)
@@ -2114,7 +2114,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT_MOE:
                 {
                     tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
-                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
+                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
 
                     if (arch == LLM_ARCH_BERT) {
                         pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
@@ -5885,8 +5885,10 @@ struct llm_build_bert : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);
 
         // token types are hardcoded to zero ("Sentence A")
-        ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
-        inpL = ggml_add(ctx0, inpL, type_row0);
+        if (model.type_embd) {
+            ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+            inpL = ggml_add(ctx0, inpL, type_row0);
+        }
         if (model.arch == LLM_ARCH_BERT) {
             inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
         }
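Both llama-model.cpp hunks handle the same architectural difference: DistilBERT was distilled without the segment ("Sentence A"/"Sentence B") embedding, so a converted GGUF contains no token-type tensor. Loading it with TENSOR_NOT_REQUIRED and skipping the `type_row0` addition when `model.type_embd` is null leaves classic BERT behaviour untouched while letting DistilBERT graphs build. The difference is easy to see on the HF side; a sketch with state-dict keys written from memory, for illustration:

# Embedding-table keys of the two architectures (illustrative, not exhaustive).
bert_embeddings = {
    "embeddings.word_embeddings.weight",
    "embeddings.position_embeddings.weight",
    "embeddings.token_type_embeddings.weight",  # the tensor that becomes type_embd
    "embeddings.LayerNorm.weight",
}
distilbert_embeddings = {
    "embeddings.word_embeddings.weight",
    "embeddings.position_embeddings.weight",    # no token_type_embeddings at all
    "embeddings.LayerNorm.weight",
}
print(bert_embeddings - distilbert_embeddings)
# -> {'embeddings.token_type_embeddings.weight'}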