From: ymcki
Date: Sat, 3 May 2025 15:39:51 +0000 (+0800)
Subject: llama : Llama-3_1-Nemotron-Ultra-253B-v1 support (#12843)
X-Git-Tag: upstream/0.0.5318~48
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=3bf785f3efa89ed28294fbf73054558a2b034bfb;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : Llama-3_1-Nemotron-Ultra-253B-v1 support (#12843)
---

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 0862099d..83899953 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2123,6 +2123,9 @@ class DeciModel(TextModel):
             # if n_heads_in_group is not None, then
             # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
             # _num_heads[il] is num_attention_head
+            # ***dummy layer*** for nemotron 253B
+            # if n_heads_in_group is None and ffn_mult is None
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
             for il in range(len(_block_configs)):
                 if _block_configs[il]["attention"]["n_heads_in_group"] is None:
                     if _block_configs[il]["attention"]["replace_with_linear"] is True:
@@ -2134,7 +2137,10 @@ class DeciModel(TextModel):
                 else:
                     self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
                     self._num_heads.append(self.hparams["num_attention_heads"])
-                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+                if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
+                    _ffn_multipliers.append(0.0)
+                else:
+                    _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
             assert self.block_count == len(self._num_kv_heads)
             assert self.block_count == len(self._num_heads)
             assert self.block_count == len(_ffn_multipliers)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 08d21301..8d25070f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -582,6 +583,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1848,7 +1850,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }
 
                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1858,9 +1862,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
 
                     // optional MLP bias
                     layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -4705,6 +4711,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);
 
             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4780,6 +4787,11 @@ struct llm_build_deci : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_head == 0 && n_ff == 0) {
+                continue;
+            }
+
             // For Granite architecture
             if (hparams.f_residual_scale) {
                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
diff --git a/src/llama-model.h b/src/llama-model.h
index 4c7e7a33..815fa11e 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
+   LLM_TYPE_405B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
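
For illustration only, here is a minimal standalone sketch of the per-layer classification the patched DeciModel converter performs. The block_configs entries and num_attention_heads value below are hypothetical, not taken from an actual checkpoint; they only exercise the fields the converter reads (attention.n_heads_in_group, attention.replace_with_linear, ffn.ffn_mult). A dummy layer (both fields None) maps to all zeros, which is what the C++ side later sees as n_head == 0 and n_ff == 0.

    # Illustrative sketch with hypothetical values; mirrors the classification
    # logic in DeciModel.__init__ of convert_hf_to_gguf.py after this patch.
    num_attention_heads = 128  # hypothetical; read from the checkpoint config in practice

    block_configs = [
        {"attention": {"n_heads_in_group": 8,    "replace_with_linear": False}, "ffn": {"ffn_mult": 2.5}},   # normal attention layer
        {"attention": {"n_heads_in_group": None, "replace_with_linear": True},  "ffn": {"ffn_mult": 2.5}},   # linear attention layer
        {"attention": {"n_heads_in_group": None, "replace_with_linear": False}, "ffn": {"ffn_mult": 2.5}},   # attention-free layer
        {"attention": {"n_heads_in_group": None, "replace_with_linear": False}, "ffn": {"ffn_mult": None}},  # dummy layer (Nemotron 253B)
    ]

    num_heads, num_kv_heads, ffn_multipliers = [], [], []
    for cfg in block_configs:
        attn = cfg["attention"]
        if attn["n_heads_in_group"] is None:
            num_kv_heads.append(0)
            num_heads.append(num_attention_heads if attn["replace_with_linear"] else 0)
        else:
            num_kv_heads.append(num_attention_heads // attn["n_heads_in_group"])
            num_heads.append(num_attention_heads)
        # ffn_mult of None marks a dummy layer; map it to 0.0 so the loader sees n_ff == 0
        mult = cfg["ffn"]["ffn_mult"]
        ffn_multipliers.append(0.0 if mult is None else mult)

    print(list(zip(num_heads, num_kv_heads, ffn_multipliers)))
    # -> [(128, 16, 2.5), (128, 0, 2.5), (0, 0, 2.5), (0, 0, 0.0)]

Those zeros are why, in the diff above, load_tensors only creates ffn_norm/ffn_gate/ffn_down/ffn_up when n_ff > 0, and llm_build_deci skips the residual/FFN part of the layer entirely when n_head == 0 && n_ff == 0.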