bool llama_hparams::is_swa(uint32_t il) const {
    if (il < n_layer) {
-       return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
+       return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1));
    }

    GGML_ABORT("fatal error");
}
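
// Illustrative sketch (not part of the patch above): a standalone check of the new
// return expression. The helper name check_swa, the layer count and the pattern
// values are hypothetical; the loop only reproduces the layer layout described in
// the header comment below (n_swa_pattern == 0 -> all SWA, 1 -> all dense,
// 3 -> every 3rd layer dense).
#include <cstdint>
#include <cstdio>

// same expression as the updated is_swa() return
static bool check_swa(uint32_t il, uint32_t n_swa_pattern) {
    return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1));
}

int main() {
    const uint32_t n_layer    = 7;           // small layer count, demo only
    const uint32_t patterns[] = { 0, 1, 3 }; // pattern values to inspect

    for (uint32_t n_swa_pattern : patterns) {
        printf("n_swa_pattern = %u:", (unsigned) n_swa_pattern);
        for (uint32_t il = 0; il < n_layer; ++il) {
            printf(" %s", check_swa(il, n_swa_pattern) ? "swa" : "dense");
        }
        printf("\n");
    }
    return 0;
}
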
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA)
- uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
+ uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. non-SWA)
+                             // by default n == 1, all layers are dense
+                             // note that if n_swa_pattern == 0, all layers are SWA
+                             // example: n_swa_pattern = 3
+                             //   il == 0: swa
+                             //   il == 1: swa
+                             //   il == 2: dense
+                             //   il == 3: swa
+                             //   il == 4: swa
+                             //   il == 5: dense
+                             //   il == 6: swa
+                             //   etc ...
// for State Space Models
uint32_t ssm_d_conv = 0;