sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
+
class GGMLFormat(IntEnum):
GGML = 0
GGMF = 1
GGJT = 2
+
class GGMLFType(IntEnum):
ALL_F32 = 0
MOSTLY_F16 = 1
MOSTLY_Q5_K_M = 17
MOSTLY_Q6_K = 18
+
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
+
class Vocab:
def __init__(self, load_scores = True):
self.items = []
self.items.append((item_text, item_score))
return offset - orig_offset
+
class Tensor:
def __init__(self, use_padding = True):
self.name = None
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
+
class GGMLModel:
def __init__(self):
self.hyperparameters = None
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
- if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
- GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+ if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+ GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
if len(err) > 0:
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
hp.set_n_ff(self)
return offset
+
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
hp = ggml_model.hyperparameters
gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
- use_temp_file = False )
+ use_temp_file = False)
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
if self.special_vocab is not None:
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
raw_shape = tempdims,
- raw_dtype = tensor.dtype )
+ raw_dtype = tensor.dtype)
+
def handle_metadata(cfg, hp):
import convert
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
- cfg.vocabtype )
+ cfg.vocabtype)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
- load_merges = cfg.vocabtype == 'bpe',
- n_vocab = vocab.vocab_size)
+ load_merges = cfg.vocabtype == 'bpe',
+ n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)
+
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True,
- help = 'Input GGMLv3 filename')
+ help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, required = True,
- help ='Output GGUF filename')
+ help = 'Output GGUF filename')
parser.add_argument('--name',
- help = 'Set model name')
+ help = 'Set model name')
parser.add_argument('--desc',
- help = 'Set model description')
+ help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1,
- help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+ help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06',
- help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+ help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048,
- help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+ help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path,
- help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+ help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
- help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+ help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
- help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+ help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args()
+
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
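# Memory-map the input file so large models are not read fully into RAM up front.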
data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel()
print('* Scanning GGML input file')
- offset = model.load(data, 0)
+ offset = model.load(data, 0) # noqa
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
- converter = GGMLToGGUF(model, data, cfg,
+ converter = GGMLToGGUF(
+ model, data, cfg,
params_override = params_override,
vocab_override = vocab_override,
- special_vocab = special_vocab )
+ special_vocab = special_vocab
+ )
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
+
if __name__ == '__main__':
main()
# data types
#
+
@dataclass(frozen=True)
class DataType:
name: str
def elements_to_bytes(self, n_elements: int) -> int:
return n_elements * self.dtype.itemsize
+
@dataclass(frozen=True)
class UnquantizedDataType(DataType):
pass
+
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
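# numpy has no native bfloat16 dtype, so BF16 tensors are carried as their raw uint16 bit patterns.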
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
+
@dataclass(frozen=True)
class QuantizedDataType(DataType):
block_size: int
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
+
@dataclass(frozen=True)
class Q8_0QuantizedDataType(QuantizedDataType):
# Mini Q8_0 quantization in Python!
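# Each 32-element block of float32 values is stored as one f16 scale 'd' plus 32 int8 values 'qs' (see quantized_dtype below).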
n_blocks = arr.size // self.block_size
blocks = arr.reshape((n_blocks, self.block_size))
# Much faster implementation of block quantization contributed by @Cebtenzzre
+
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
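# Per-block scale: chosen so the largest-magnitude value in each block maps to the int8 limit of 127.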
d = abs(blocks).max(axis = 1) / np.float32(127)
with np.errstate(divide = 'ignore'):
yield from zip(d, qs)
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
+
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
- dtype = np.dtype(np.float32), valid_conversions = [],
- ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
- quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
+ dtype = np.dtype(np.float32), valid_conversions = [],
+ ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+ quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
# Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
# TODO: match this with `llama_ftype`
# TODO: rename to LLAMAFileType
# TODO: move to `gguf.py`
+
+
class GGMLFileType(enum.IntEnum):
AllF32 = 0
MostlyF16 = 1 # except 1d tensors
# 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32
+
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
GGMLFileType.AllF32 : DT_F32,
GGMLFileType.MostlyF16 : DT_F16,
# hparams loading
#
+
@dataclass
class Params:
n_vocab: int
# try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model:
- n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+ n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
- n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+ n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
else:
- n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+ n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
if n_layer < 1:
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
- if item['content'] not in self.bpe_tokenizer )
+ if item['content'] not in self.bpe_tokenizer)
vocab_size: int = len(self.bpe_tokenizer)
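# Added tokens are expected to occupy the ids immediately following the base vocabulary.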
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.bpe_tokenizer
- from transformers.models.gpt2 import tokenization_gpt2
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
for i, _ in enumerate(tokenizer):
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
#
# TODO: reuse (probably move to gguf.py?)
#
+
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
- #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+ # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
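# For grouped-query attention the K projection has only n_head_kv heads, so permute over that count.
# The reshape/swapaxes below undoes the head-wise rotary permutation applied by Hugging Face LLaMA checkpoints.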
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape))
+ .swapaxes(1, 2)
+ .reshape(weights.shape))
class Tensor(metaclass=ABCMeta):
ret = self._load()
# Should be okay if it maps to the same numpy type?
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
- (self.data_type, ret.data_type, self.description)
+ (self.data_type, ret.data_type, self.description)
return ret
def astype(self, data_type: DataType) -> LazyTensor:
return lazy_tensor.load().permute(n_head, n_head_kv)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
+
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
s[0] = s[0] // 3
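# W_pack stores Q, K and V stacked along the first dimension, so each part is one third of it.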
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
+
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().part(n_part)
In = TypeVar('In')
Out = TypeVar('Out')
+
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than
break
yield result
+
def check_vocab_size(params: Params, vocab: Vocab) -> None:
if params.n_vocab != vocab.vocab_size:
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
class OutputFile:
- def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+ def __init__(self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
self.gguf.close()
@staticmethod
- def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+ def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out, endianess=endianess)
of.close()
+
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
- wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+ wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32
raise Exception(f"Unexpected combination of types: {name_to_type}")
+
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}
+
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
- #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+ # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
return out
+
def nth_multifile_path(path: Path, n: int) -> Path | None:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model.
# FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
- load_merges = args.vocabtype == 'bpe',
- n_vocab = vocab.vocab_size)
+ load_merges = args.vocabtype == 'bpe',
+ n_vocab = vocab.vocab_size)
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
print(f"Wrote {outfile}")
vocab = load_vocab(vocab_dir, args.vocabtype)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
- load_merges = args.vocabtype == 'bpe',
- n_vocab = vocab.vocab_size)
+ load_merges = args.vocabtype == 'bpe',
+ n_vocab = vocab.vocab_size)
model = model_plus.model
model = convert_model_names(model, params)