convert : add Llama4ForCausalLM (#16042)

author Xuan-Son Nguyen <redacted>

Wed, 17 Sep 2025 17:18:21 +0000 (00:18 +0700)

committer GitHub <redacted>

Wed, 17 Sep 2025 17:18:21 +0000 (19:18 +0200)
author Xuan-Son Nguyen <redacted>
Wed, 17 Sep 2025 17:18:21 +0000 (00:18 +0700)
committer GitHub <redacted>
Wed, 17 Sep 2025 17:18:21 +0000 (19:18 +0200)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index ce83f24695ec7d35ebe3ded1d55ae7f8bebafe7b..7ddec48ad71290411991b0ee9f9401c0789d6422 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2393,7 +2393,10 @@ class SmolVLMModel(MmprojModel):
          return [] # skip other tensors
  
  
-@ModelBase.register("Llama4ForConditionalGeneration")
+@ModelBase.register(
+    "Llama4ForConditionalGeneration",
+    "Llama4ForCausalLM",
+)
  class Llama4Model(LlamaModel):
      model_arch = gguf.MODEL_ARCH.LLAMA4
      undo_permute = False
@@ -2411,6 +2414,10 @@ class Llama4Model(LlamaModel):
          super().set_gguf_parameters()
          self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
          self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+        if "layer_types" in self.hparams:
+            if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
+                # all layers are full attention (for MobileLLM), disable swa
+                self.gguf_writer.add_sliding_window(0)
  
      def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
          if name.startswith("language_model."):
diff --git a/src/llama-hparams.h b/src/llama-hparams.h

index 116d728e8c9f27b9991985055d322bb813573f8e..202cbbd1b288423d80bc1bfd2399fcc7160e46aa 100644 (file)
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -149,7 +149,7 @@ struct llama_hparams {
      bool causal_attn   = true;
      bool use_alibi     = false;
      bool attn_soft_cap = false;
-    bool use_kq_norm   = true;
+    bool use_kq_norm   = false;
  
      // for Classifiers
      uint32_t n_cls_out = 1;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index 2be807a6a9dabb2a2db13c258dc20e504b8c5ada..981e57083c48d90610fd413496e90e14d858ad7f 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
          case LLM_TYPE_80M:           return "80M";
          case LLM_TYPE_109M:          return "109M";
          case LLM_TYPE_137M:          return "137M";
+        case LLM_TYPE_140M:          return "140M";
          case LLM_TYPE_160M:          return "160M";
          case LLM_TYPE_190M:          return "190M";
          case LLM_TYPE_220M:          return "220M";
@@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) {
          case LLM_TYPE_270M:          return "270M";
          case LLM_TYPE_335M:          return "335M";
          case LLM_TYPE_350M:          return "350M";
+        case LLM_TYPE_360M:          return "360M";
          case LLM_TYPE_410M:          return "410M";
          case LLM_TYPE_450M:          return "450M";
          case LLM_TYPE_475M:          return "475M";
@@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) {
          case LLM_TYPE_700M:          return "700M";
          case LLM_TYPE_770M:          return "770M";
          case LLM_TYPE_780M:          return "780M";
+        case LLM_TYPE_950M:          return "950M";
          case LLM_TYPE_0_3B:          return "0.3B";
          case LLM_TYPE_0_5B:          return "0.5B";
          case LLM_TYPE_0_6B:          return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
  
-                hparams.swa_type      = LLAMA_SWA_TYPE_CHUNKED;
-                hparams.n_swa         = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
-                hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa == 0) {
+                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
+                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+                } else {
+                    hparams.swa_type      = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa         = 8192;
+                    hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
+                }
  
                  switch (hparams.n_expert) {
+                    case 0: {
+                        // MobileLLM (no MoE)
+                        switch (hparams.n_embd) {
+                            case 2048: type = LLM_TYPE_140M; break;
+                            case 4096: type = LLM_TYPE_360M; break;
+                            case 6144: type = LLM_TYPE_950M; break;
+                            default:   type = LLM_TYPE_UNKNOWN;
+                        }
+                    } break;
                      case 16:  type = LLM_TYPE_17B_16E; break;
                      case 128: type = LLM_TYPE_17B_128E; break;
                      default:  type = LLM_TYPE_UNKNOWN;
                  }
  
-                if (type == LLM_TYPE_17B_128E) {
-                    hparams.use_kq_norm = false;
-                }
+                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
              } break;
          case LLM_ARCH_ARCEE:
              {
@@ -2454,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                          output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                      }
  
-                    GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
                      for (int i = 0; i < n_layer; ++i) {
-                        bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
  
                          auto & layer = layers[i];
  
@@ -6328,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
                  cb(Kcur, "Kcur", il);
                  cb(Vcur, "Vcur", il);
  
+                if (hparams.use_kq_norm) {
+                    // Llama4TextL2Norm
+                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                    cb(Qcur, "Qcur_normed", il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
                  cur = build_attn(inp_attn,
                          model.layers[il].wo, model.layers[il].bo,
                          Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6435,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
          for (int il = 0; il < n_layer; ++il) {
              ggml_tensor * inpSA = inpL;
  
-            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                  (il + 1) % hparams.n_no_rope_layer_step != 0;
  
              // norm
              cur = build_norm(inpL,
@@ -18981,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
              } break;
          case LLM_ARCH_LLAMA4:
              {
-                llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_llama>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                }
              } break;
          case LLM_ARCH_DECI:
              {
diff --git a/src/llama-model.h b/src/llama-model.h

index 10b1767f27228fd55f6ad3546b0e81704918d97b..b1981978e3acf8753a96dceff561b1f6e5060cdf 100644 (file)
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -28,6 +28,7 @@ enum llm_type {
      LLM_TYPE_80M,
      LLM_TYPE_109M,
      LLM_TYPE_137M,
+    LLM_TYPE_140M,
      LLM_TYPE_160M,
      LLM_TYPE_190M,
      LLM_TYPE_220M,
@@ -36,6 +37,7 @@ enum llm_type {
      LLM_TYPE_270M,
      LLM_TYPE_335M,
      LLM_TYPE_350M,
+    LLM_TYPE_360M,
      LLM_TYPE_410M,
      LLM_TYPE_450M,
      LLM_TYPE_475M,
@@ -43,6 +45,7 @@ enum llm_type {
      LLM_TYPE_700M,
      LLM_TYPE_770M,
      LLM_TYPE_780M,
+    LLM_TYPE_950M,
      LLM_TYPE_0_3B,
      LLM_TYPE_0_5B,
      LLM_TYPE_0_6B,
author	Xuan-Son Nguyen <redacted>
	Wed, 17 Sep 2025 17:18:21 +0000 (00:18 +0700)
committer	GitHub <redacted>
	Wed, 17 Sep 2025 17:18:21 +0000 (19:18 +0200)
convert_hf_to_gguf.py		patch \| blob \| history
src/llama-hparams.h		patch \| blob \| history
src/llama-model.cpp		patch \| blob \| history
src/llama-model.h		patch \| blob \| history