self.gguf_writer.add_expert_used_count(n_experts_used)
logger.info(f"gguf: experts used count = {n_experts_used}")
+ if (head_dim := self.hparams.get("head_dim")) is not None:
+     self.gguf_writer.add_key_length(head_dim)
+     self.gguf_writer.add_value_length(head_dim)
+
self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}")
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
- self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
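+ # prefer an explicit head_dim for the RoPE dimension count, falling back to the derived value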
+ if "head_dim" in hparams:
+ rope_dim = hparams["head_dim"]
+ else:
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
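+ // attention weight shapes are expressed via n_embd_head_k and the separate K/V GQA dims,
+ // so models where head_dim != n_embd / n_head load with the correct tensor sizes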
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);