From: Aleksei Nikiforov Date: Thu, 28 Aug 2025 08:56:41 +0000 (+0200) Subject: gguf-py: byteswapping improvements (#12851) X-Git-Tag: upstream/0.0.6527~221 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=64387f6e95434b393ac3df285864692b7fd9c4d2;p=pkg%2Fggml%2Fsources%2Fllama.cpp gguf-py: byteswapping improvements (#12851) * gguf-py: implement byteswapping for Q4_0 This is needed to byteswap Mistral model. Also restore original shapes after byteswapping tensors. It is not needed at the moment, but do it in case they'd be used in future. * Rework byteswapping code in gguf-py Move out details from byteswapping tensor blocks code --- diff --git a/gguf-py/gguf/scripts/gguf_convert_endian.py b/gguf-py/gguf/scripts/gguf_convert_endian.py index 0e0febaa..211a3f53 100755 --- a/gguf-py/gguf/scripts/gguf_convert_endian.py +++ b/gguf-py/gguf/scripts/gguf_convert_endian.py @@ -19,6 +19,61 @@ import gguf logger = logging.getLogger("gguf-convert-endian") +def byteswap_q4_0(tensor, block_offs): + # Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 int8 quantizations. + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +def byteswap_q8_0(tensor, block_offs): + # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +def byteswap_q4_k(tensor, block_offs): + # Each block_q4_k consists of 2 f16 values followed by 140 int8 values. + + # Byte-Swap f16 sized fields + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +def byteswap_q6_k(tensor, block_offs): + # Each block_q6_k consists of 208 int8 values followed by 1 f16 value. 
+ + # Byte-Swap f16 sized field + delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +byteswap_tensors = { + gguf.GGMLQuantizationType.Q4_0: { + "block_size": 18, # 18 bytes = f16 delta scaling factor + 16 * int8 quant + "byteswap_func": byteswap_q4_0, + }, + gguf.GGMLQuantizationType.Q8_0: { + "block_size": 34, # 34 bytes = f16 delta scaling factor + 32 * int8 quant + "byteswap_func": byteswap_q8_0, + }, + gguf.GGMLQuantizationType.Q4_K: { + "block_size": 144, # 144 bytes = 2 * f16 value + 140 * int8 value + "byteswap_func": byteswap_q4_k, + }, + gguf.GGMLQuantizationType.Q6_K: { + "block_size": 210, # 210 bytes = f16 value + 208 * int8 value + "byteswap_func": byteswap_q6_k, + }, +} + + def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: file_endian = reader.endianess.name if reader.byte_order == 'S': @@ -32,13 +87,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None sys.exit(0) logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: - if tensor.tensor_type not in ( - gguf.GGMLQuantizationType.F32, - gguf.GGMLQuantizationType.F16, - gguf.GGMLQuantizationType.Q8_0, - gguf.GGMLQuantizationType.Q4_K, - gguf.GGMLQuantizationType.Q6_K, - ): + if tensor.tensor_type not in byteswap_tensors and \ + tensor.tensor_type not in ( + gguf.GGMLQuantizationType.F32, + gguf.GGMLQuantizationType.F16, + ): raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") logger.info(f"* Preparing to convert from {file_endian} to {order}") if args.dry_run: @@ -72,78 +125,29 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None part.byteswap(inplace=True) # Byte-swap tensor data if necessary - if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0: - # Handle Q8_0 tensor blocks (block_q8_0) - # Specific handling of block_q8_0 is required. - # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. 
- - block_size = 34 # 34 bytes = f16 delta scaling factor + 32 * int8 quant - - n_blocks = len(tensor.data) // block_size - for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): - block_offs = block_num * block_size - - # Byte-Swap f16 sized delta field - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - - # Byte-Swap Q8 weights - if block_num % 100000 == 0: - inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") - - elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K: - # Handle Q4_K tensor blocks (block_q4_k) - # Specific handling of block_q4_k is required. - # Each block_q4_k consists of 2 f16 values followed by 140 int8 values. - + if tensor.tensor_type in byteswap_tensors: # first flatten structure + oldshape = tensor.data.shape newshape = 1 for i in tensor.data.shape: newshape *= i tensor.data.resize(newshape) - block_size = 144 - n_blocks = len(tensor.data) // block_size - for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): - block_offs = block_num * block_size - - # Byte-Swap f16 sized fields - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - - delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16) - delta.byteswap(inplace=True) - - # Byte-Swap - if block_num % 100000 == 0: - inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") - - elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K: - # Handle Q6_K tensor blocks (block_q6_k) - # Specific handling of block_q6_k is required. - # Each block_q6_k consists of 208 int8 values followed by 1 f16 value. 
- - # first flatten structure - newshape = 1 - for i in tensor.data.shape: - newshape *= i - - tensor.data.resize(newshape) + block_size = byteswap_tensors[tensor.tensor_type]["block_size"] + byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"] - block_size = 210 n_blocks = len(tensor.data) // block_size for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): block_offs = block_num * block_size - # Byte-Swap f16 sized field - delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16) - delta.byteswap(inplace=True) + byteswap_func(tensor, block_offs) - # Byte-Swap if block_num % 100000 == 0: inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + # restore old shape in case it's ever used + tensor.data.resize(oldshape) else: # Handle other tensor types tensor.data.byteswap(inplace=True)