From: fairydreaming
Date: Thu, 19 Dec 2024 09:37:12 +0000 (+0100)
Subject: convert : Add support for Microsoft Phi-4 model (#10817)
X-Git-Tag: upstream/0.0.4488~127
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=7585edbdebd02861e0994dae67c9338731fb3fc5;p=pkg%2Fggml%2Fsources%2Fllama.cpp

convert : Add support for Microsoft Phi-4 model (#10817)

* convert : use GPT2 vocab for Phi-4 model

* convert : use null value of sliding_window to distinguish Phi-4 from other PHI3-based models

* llama : do not use sliding window attention mask for Phi-4 model

---------

Co-authored-by: Stanisław Szymczyk
---

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4a0b00f6..7b433ee6 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2200,6 +2200,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2316,7 +2325,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
diff --git a/src/llama.cpp b/src/llama.cpp
index 94160d53..cec15a00 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13333,7 +13333,13 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
@@ -13391,7 +13397,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {