llama : fix llama3.1 rope_freqs not respecting custom head_dim (#9141)
author     Carsten Kragelund Jørgensen <redacted>
           Tue, 27 Aug 2024 06:53:40 +0000 (08:53 +0200)
committer  GitHub <redacted>
           Tue, 27 Aug 2024 06:53:40 +0000 (09:53 +0300)
* fix: llama3.1 rope_freqs not respecting custom head_dim

* fix: use potential head_dim for Exaone

convert_hf_to_gguf.py
src/llama.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 25853623723e84432eede1c257c9f1cd26f14548..caa41aee5f30be7f63178aa4d6aa3ac0d2b0b11f 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1572,7 +1572,7 @@ class LlamaModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3820,7 +3820,7 @@ class ExaoneModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
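
The converter change above makes the llama3 rope-scaling path take the rotary dimension from an explicit head_dim hyperparameter when the model defines one, falling back to hidden_size // num_attention_heads otherwise. A minimal sketch of the difference, using made-up hyperparameter values rather than any real checkpoint's config.json:

import torch

# Illustrative hparams for a model whose head_dim is not hidden_size / num_attention_heads.
hparams = {
    "hidden_size": 5120,
    "num_attention_heads": 40,
    "head_dim": 160,          # custom head dimension
    "rope_theta": 500000.0,
}

base = hparams.get("rope_theta", 10000.0)

# Old behaviour: always derived from hidden_size -> 5120 // 40 = 128.
dim_old = hparams["hidden_size"] // hparams["num_attention_heads"]

# New behaviour: prefer the explicit head_dim (160), same fallback otherwise.
dim_new = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"])

# Base frequencies, one per pair of rotary dimensions, as in the converter code above.
freqs = 1.0 / (base ** (torch.arange(0, dim_new, 2, dtype=torch.float32) / dim_new))

print(dim_old, dim_new, freqs.shape)  # 128 160 torch.Size([80])

The same fallback is applied in ExaoneModel, whose llama3 rope-scaling block mirrors LlamaModel's.
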
diff --git a/src/llama.cpp b/src/llama.cpp
index fc8fb3e0ddef2a87ca274fb676279678f7b67ecc..7c148b8305e1e6ca2bb28153105dc2d90893e0a5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6605,6 +6605,7 @@ static bool llm_load_tensors(
         const int64_t n_embd_gqa    = n_embd_v_gqa;
         const int64_t n_vocab       = hparams.n_vocab;
         const int64_t n_vocab_type  = hparams.n_vocab_type;
+        const int64_t n_rot         = hparams.n_rot;
         const int64_t n_expert      = hparams.n_expert;
         const int64_t n_expert_used = hparams.n_expert_used;
         const int64_t n_ctx_train   = hparams.n_ctx_train;
@@ -6662,7 +6663,7 @@ static bool llm_load_tensors(
 
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 
                         if (n_expert == 0) {
                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
@@ -8193,7 +8194,7 @@ static bool llm_load_tensors(
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
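
On the loader side, the converter writes one rope_freqs value per frequency pair, i.e. dim/2 entries, so llm_load_tensors now sizes the tensor from the rotary dimension n_rot instead of n_embd/n_head; for a model with a custom head_dim those two quantities differ. A small sketch of the mismatch, reusing the same made-up numbers as above:

# Illustrative values only (same as the converter sketch above, not from a real model).
n_embd, n_head, n_rot = 5120, 40, 160   # n_rot corresponds to the custom head_dim

old_expected = n_embd // n_head // 2    # old shape {n_embd/n_head/2} -> 64
new_expected = n_rot // 2               # new shape {n_rot/2}         -> 80
written      = 160 // 2                 # converter emits dim/2 = 80 frequencies

print(old_expected, new_expected, written)  # 64 80 80

With the old shape the loader's expectation would no longer match the tensor the converter emits once head_dim is respected; sizing it as {n_rot/2} keeps the two sides consistent.
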