llama : add Phi-4-mini support (supersede #12099) (#12108)

author Xuan-Son Nguyen <redacted>

Fri, 28 Feb 2025 11:44:11 +0000 (12:44 +0100)

committer GitHub <redacted>

Fri, 28 Feb 2025 11:44:11 +0000 (12:44 +0100)
author Xuan-Son Nguyen <redacted>
Fri, 28 Feb 2025 11:44:11 +0000 (12:44 +0100)
committer GitHub <redacted>
Fri, 28 Feb 2025 11:44:11 +0000 (12:44 +0100)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index 8b7c75d85a6f557f78487c3482659286d1d083cc..6358a94e9b55f8d21f564c13427a80ef07385aef 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -699,6 +699,9 @@ class Model:
          if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
              # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
              res = "deepseek-r1-qwen"
+        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+            # ref: https://huggingface.co/Xenova/gpt-4o
+            res = "gpt-4o"
  
          if res is None:
              logger.warning("\n")
@@ -2512,7 +2515,8 @@ class Phi3MiniModel(Model):
          rms_eps = self.find_hparam(["rms_norm_eps"])
          max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
          orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
  
          self.gguf_writer.add_context_length(max_pos_embds)
          self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2536,7 +2540,8 @@ class Phi3MiniModel(Model):
          n_head = self.find_hparam(["num_attention_heads", "n_head"])
          max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
          orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
  
          # write rope scaling for long context (128k) model
          rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2565,7 +2570,7 @@ class Phi3MiniModel(Model):
              raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
  
          if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
  
          yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
          yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py

index fa4989a80c5447c9b981fc5e1e6bcf3e06a7af4a..07d3ce0e4eb78bbedaa4f4e40db2eeb8f227fce8 100755 (executable)
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -109,6 +109,7 @@ models = [
      {"name": "megrez",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
      {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
      {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+    {"name": "gpt-4o",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
  ]
  
  
@@ -131,6 +132,10 @@ def download_model(model):
  
      files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
  
+    if name == "gpt-4o":
+        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
+        files = ["tokenizer.json", "tokenizer_config.json"]
+
      if tokt == TOKENIZER_TYPE.SPM:
          files.append("tokenizer.model")
  
diff --git a/include/llama.h b/include/llama.h

index 479196026b93bf5f3a7600f5a4c100326ebad152..ee6e73915f136aa999f74ce4020b58e17d1d9028 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -105,6 +105,7 @@ extern "C" {
          LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
          LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
          LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
      };
  
      enum llama_rope_type {
diff --git a/models/ggml-vocab-gpt-4o.gguf.inp b/models/ggml-vocab-gpt-4o.gguf.inp

new file mode 100644 (file)

index 0000000..9baf7d7
--- /dev/null
+++ b/models/ggml-vocab-gpt-4o.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+       
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+       
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+                               
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-gpt-4o.gguf.out b/models/ggml-vocab-gpt-4o.gguf.out

new file mode 100644 (file)

index 0000000..478df72
--- /dev/null
+++ b/models/ggml-vocab-gpt-4o.gguf.out
@@ -0,0 +1,46 @@
+ 1165 220 19 220 27124 5503
+ 37 19194 259
+
+ 220
+ 256
+ 271
+ 197
+ 198
+ 279
+ 2499
+ 2775
+ 13225 2375
+ 32949 2375
+ 13225 5922
+ 32949 5922
+ 32949 5922 0
+ 13225 11 2375 0
+ 32949 11 2375 0
+ 495 382 9552 99 247 13 17159
+ 86 45404 220 22 10191 2852 22924 4750 6916
+ 3907 53641 1235 185386 8118
+ 11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335
+ 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8
+ 13225
+ 32949
+ 220 32949
+ 256 32949
+ 271 32949
+ 271 32949 198 271 32949
+ 350
+ 198 314
+ 6 6837
+ 13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208
+ 147475
+ 18
+ 2546
+ 15517
+ 15517 18
+ 15517 2546
+ 15517 15517
+ 15517 15517 18
+ 15517 15517 2546
+ 15517 15517 15517
+ 34 60213 53904
+ 2960 3098
+ 126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index 36a0a009c45672d1c4b17f2489b48c3759589810..1da4eae7e63e2aa435cfd52a0b3a6e9772218ccf 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2202,13 +2202,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                  } break;
              case LLM_ARCH_PHI3:
                  {
-                    const int64_t n_embd_head = n_embd / n_head;
-
                      tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  
                      // output
                      output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
  
                      for (int i = 0; i < n_layer; ++i) {
                          auto & layer = layers[i];
@@ -2223,8 +2226,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                          layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                          layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  
-                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                      }
                  } break;
              case LLM_ARCH_PHIMOE:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index ad9ffe66aa749d97ec19ebe7286ed226e63b5575..163ff64f779732cd69d575d0e227f59ace6fa5dd 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -392,6 +392,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                      "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                  };
                  break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
              default:
                  // default regex for BPE tokenization pre-processing
                  regex_exprs = {
@@ -1592,6 +1599,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              } else if (
                  tokenizer_pre == "megrez") {
                  pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "gpt-4o") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+                clean_spaces = false;
              } else {
                  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
              }
author	Xuan-Son Nguyen <redacted>
	Fri, 28 Feb 2025 11:44:11 +0000 (12:44 +0100)
committer	GitHub <redacted>
	Fri, 28 Feb 2025 11:44:11 +0000 (12:44 +0100)
convert_hf_to_gguf.py		patch | blob | history
convert_hf_to_gguf_update.py		patch | blob | history
include/llama.h		patch | blob | history
models/ggml-vocab-gpt-4o.gguf.inp	[new file with mode: 0644]	patch | blob
models/ggml-vocab-gpt-4o.gguf.out	[new file with mode: 0644]	patch | blob
src/llama-model.cpp		patch | blob | history
src/llama-vocab.cpp		patch | blob | history