+++ /dev/null
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import logging
-import argparse
-import os
-import sys
-from pathlib import Path
-from pprint import pprint
-
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-logger = logging.getLogger("persimmon-to-gguf")
-
-
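-# Recursively flatten a nested dict of tensors into a flat
-# {'outer.inner.key': tensor} mapping, filled in place via `tensors`.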
-def _flatten_dict(dct, tensors, prefix=None):
- assert isinstance(dct, dict)
- for key in dct.keys():
- new_prefix = prefix + '.' + key if prefix is not None else key
- if isinstance(dct[key], torch.Tensor):
- tensors[new_prefix] = dct[key]
- elif isinstance(dct[key], dict):
- _flatten_dict(dct[key], tensors, new_prefix)
- else:
-            raise ValueError(f"unexpected value type at '{new_prefix}': {type(dct[key])}")
- return None
-
-
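-# Load the adept SentencePiece vocab and return parallel lists of token
-# text, scores and gguf token types.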
-def _get_sentencepiece_tokenizer_info(dir_model: Path):
- tokenizer_path = dir_model / 'adept_vocab.model'
-    logger.info('getting sentencepiece tokenizer from %s', tokenizer_path)
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
- logger.info('adding tokens')
- tokens: list[bytes] = []
- scores: list[float] = []
- toktypes: list[int] = []
-
- for i in range(tokenizer.vocab_size()):
- text: bytes
- score: float
-
- piece = tokenizer.id_to_piece(i)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(i)
-
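-        # token type ids follow the gguf conventions:
-        # 1 = NORMAL, 2 = UNKNOWN, 3 = CONTROL, 5 = UNUSED, 6 = BYTE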
- toktype = 1
- if tokenizer.is_unknown(i):
- toktype = 2
- if tokenizer.is_control(i):
- toktype = 3
- if tokenizer.is_unused(i):
- toktype = 5
- if tokenizer.is_byte(i):
- toktype = 6
-
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
- return tokens, scores, toktypes
-
-
-def main():
- parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
- parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
- parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
- parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
- parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
- args = parser.parse_args()
- logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-    sys.path.append(str(args.adept_inference_dir))
-    if args.outfile is None:
-        # the help text for --outfile promises a default based on the input,
-        # so derive one from the checkpoint path
-        args.outfile = args.ckpt_path.with_suffix('.gguf')
-    # load on CPU so the tensors can be converted to numpy later on
-    persimmon_model = torch.load(args.ckpt_path, map_location='cpu')
- hparams = persimmon_model['args']
- pprint(hparams)
- tensors: dict[str, torch.Tensor] = {}
- _flatten_dict(persimmon_model['model'], tensors, None)
-
- arch = gguf.MODEL_ARCH.PERSIMMON
- gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
-
- block_count = hparams.num_layers
- head_count = hparams.num_attention_heads
- head_count_kv = head_count
- ctx_length = hparams.seq_length
- hidden_size = hparams.hidden_size
-
- gguf_writer.add_name('persimmon-8b-chat')
- gguf_writer.add_context_length(ctx_length)
- gguf_writer.add_embedding_length(hidden_size)
- gguf_writer.add_block_count(block_count)
- gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
- # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
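-    # persimmon rotates only half of each attention head's dimensions,
-    # hence the extra division by 2 (see the referenced commit)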
- gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
- gguf_writer.add_head_count(head_count)
- gguf_writer.add_head_count_kv(head_count_kv)
- gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
- gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
-
- tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
- gguf_writer.add_tokenizer_model('llama')
- gguf_writer.add_tokenizer_pre('default')
- gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
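-    # adept's tokenizer uses a single id (71013) for both BOS and EOS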
- gguf_writer.add_bos_token_id(71013)
- gguf_writer.add_eos_token_id(71013)
-
- tensor_map = gguf.get_tensor_name_map(arch, block_count)
- logger.info(tensor_map)
- for name in tensors.keys():
- data_torch = tensors[name]
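-        # the rotary inv_freq buffer can be recomputed from the hyperparameters,
-        # so it is not stored in the GGUF file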
- if name.endswith(".self_attention.rotary_emb.inv_freq"):
- continue
- old_dtype = data_torch.dtype
- # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
- data = data_torch.to(torch.float32).squeeze().numpy()
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor '{name}'")
-
- n_dims = len(data.shape)
-        logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- gguf_writer.add_tensor(new_name, data)
- logger.info("gguf: write header")
- gguf_writer.write_header_to_file()
- logger.info("gguf: write metadata")
- gguf_writer.write_kv_data_to_file()
- logger.info("gguf: write tensors")
- gguf_writer.write_tensors_to_file()
-
- gguf_writer.close()
-
- logger.info(f"gguf: model successfully exported to '{args.outfile}'")
-
-
-if __name__ == '__main__':
- main()
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
- {
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
{
LLM_ARCH_MPT,
{
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
case LLM_ARCH_REFACT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
- case LLM_ARCH_PERSIMMON:
- {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
- {
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- }
-
- for (int i = 0; i < n_layer; ++i) {
- ggml_context * ctx_layer = ctx_for_layer(i);
- ggml_context * ctx_split = ctx_for_layer_split(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
- }
- } break;
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
{
return gf;
}
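-    // graph for persimmon: decoder blocks with Q/K layernorm, partial RoPE
-    // (n_rot = n_embd_head/2) and a squared-ReLU feed-forward network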
- struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
-
- struct ggml_tensor * cur;
- struct ggml_tensor * inpL;
-
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
- // inp_pos - contains the positions
- struct ggml_tensor * inp_pos = build_inp_pos();
-
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * residual = inpL;
-
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "attn_norm", il);
-
- // self attention
- {
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- // split qkv
- GGML_ASSERT(n_head_kv == n_head);
-
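-                // the fused qkv tensor is interleaved per head: reshape to
-                // [n_embd_head, 3, n_head, n_tokens] and permute so that q, k and v
-                // each become a contiguous [n_embd_head, n_head, n_tokens] slab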
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
- cb(tmpqkv, "tmpqkv", il);
-
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
- cb(tmpqkv_perm, "tmpqkv", il);
-
- struct ggml_tensor * tmpq = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- 0
- );
- cb(tmpq, "tmpq", il);
-
- struct ggml_tensor * tmpk = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
- );
- cb(tmpk, "tmpk", il);
-
- // Q/K Layernorm
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, cb, il);
- cb(tmpq, "tmpq", il);
-
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, cb, il);
- cb(tmpk, "tmpk", il);
-
-                // apply RoPE to the first n_rot dims of q/k, pass the rest through
-                // unchanged, and concatenate the two halves afterwards
- struct ggml_tensor * qrot = ggml_view_3d(
- ctx0, tmpq, n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- 0
- );
- cb(qrot, "qrot", il);
-
- struct ggml_tensor * krot = ggml_view_3d(
- ctx0, tmpk, n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head,
- 0
- );
- cb(krot, "krot", il);
-
-                // get the unrotated second half of tmpq, i.e. tmpq[n_rot:, :, :]
- struct ggml_tensor * qpass = ggml_view_3d(
- ctx0, tmpq, n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- ggml_element_size(tmpq) * n_rot
- );
- cb(qpass, "qpass", il);
-
- struct ggml_tensor * kpass = ggml_view_3d(
- ctx0, tmpk, n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head,
- ggml_element_size(tmpk) * n_rot
- );
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(qrotated, "qrotated", il);
-
- struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(krotated, "krotated", il);
-
- // ggml currently only supports concatenation on dim=2
- // so we need to permute qrot, qpass, concat, then permute back.
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- cb(qrotated, "qrotated", il);
-
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- cb(krotated, "krotated", il);
-
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- cb(qpass, "qpass", il);
-
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
- cb(Qcur, "Qcur", il);
-
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
- cb(Q, "Q", il);
-
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
- cb(Kcur, "Kcur", il);
-
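-                // v is the third contiguous slab of tmpqkv_perm, hence the offset
-                // of 2 * n_embd_head * n_head * n_tokens elements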
- struct ggml_tensor * Vcur = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
- );
- cb(Vcur, "Vcur", il);
-
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
- }
-
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- }
-
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "ffn_norm", il);
-
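-                // persimmon uses a squared-ReLU activation (LLM_FFN_RELU_SQR)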
- cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "l_out", il);
-
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, cb, -1);
- cb(cur, "result_norm", -1);
-
- cur = ggml_mul_mat(ctx0, model.output, cur);
- cb(cur, "result_output", -1);
-
- ggml_build_forward_expand(gf, cur);
-
- return gf;
- }
-
struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
{
result = llm.build_starcoder();
} break;
- case LLM_ARCH_PERSIMMON:
- {
- result = llm.build_persimmon();
- } break;
case LLM_ARCH_REFACT:
{
result = llm.build_refact();
case LLM_ARCH_FALCON:
case LLM_ARCH_GROK:
case LLM_ARCH_DBRX:
- case LLM_ARCH_PERSIMMON:
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_STABLELM: