gguf_convert_endian.py: implement byteswapping for q4_k and q6_k (#11349)
author Aleksei Nikiforov <redacted>
Mon, 24 Feb 2025 11:27:01 +0000 (12:27 +0100)
committer GitHub <redacted>
Mon, 24 Feb 2025 11:27:01 +0000 (11:27 +0000)
gguf-py/gguf/scripts/gguf_convert_endian.py

index f97e91bd4c26fdb10a89d39a60b50aa53ec05c8d..837831799b31c5f035b575ad83b2c1151d17e0f2 100755 (executable)
@@ -43,6 +43,8 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
             gguf.GGMLQuantizationType.F32,
             gguf.GGMLQuantizationType.F16,
             gguf.GGMLQuantizationType.Q8_0,
+            gguf.GGMLQuantizationType.Q4_K,
+            gguf.GGMLQuantizationType.Q6_K,
         ):
             raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
     logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
@@ -96,6 +98,59 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
                 if block_num % 100000 == 0:
                     inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
 
+        elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K:
+            # Handle Q4_K tensor blocks (block_q4_k)
+            # Specific handling of block_q4_k is required.
+            # Each block_q4_k consists of 2 f16 values followed by 140 int8 values.
+
+            # first flatten structure
+            newshape = 1
+            for i in tensor.data.shape:
+                newshape *= i
+
+            tensor.data.resize(newshape)
+
+            block_size = 144
+            n_blocks = len(tensor.data) // block_size
+            for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
+                block_offs = block_num * block_size
+
+                # Byte-Swap f16 sized fields
+                delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
+                delta.byteswap(inplace=True)
+
+                delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
+                delta.byteswap(inplace=True)
+
+                # Periodically refresh the progress bar description
+                if block_num % 100000 == 0:
+                    inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
+
+        elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K:
+            # Handle Q6_K tensor blocks (block_q6_k)
+            # Specific handling of block_q6_k is required.
+            # Each block_q6_k consists of 208 int8 values followed by 1 f16 value.
+
+            # first flatten structure
+            newshape = 1
+            for i in tensor.data.shape:
+                newshape *= i
+
+            tensor.data.resize(newshape)
+
+            block_size = 210
+            n_blocks = len(tensor.data) // block_size
+            for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
+                block_offs = block_num * block_size
+
+                # Byte-Swap f16 sized field
+                delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
+                delta.byteswap(inplace=True)
+
+                # Periodically refresh the progress bar description
+                if block_num % 100000 == 0:
+                    inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
+
         else:
             # Handle other tensor types
             tensor.data.byteswap(inplace=True)
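
The hard-coded offsets above follow directly from the block layouts noted in the comments: per ggml's block_q4_K, the two f16 fields (d and dmin) occupy bytes 0-3 of each 144-byte block, and per block_q6_K the single f16 field (d) occupies bytes 208-209 of each 210-byte block; the remaining bytes hold packed quantized values and packed scales stored as single bytes, which are endian-neutral. The snippet below is a minimal standalone sketch of the same per-block swaps for reference only; the helper names and test value are illustrative and not part of the script.

import numpy as np

Q4_K_BLOCK_SIZE = 144  # 2 x f16 (d, dmin) + 12 scale bytes + 128 bytes of packed 4-bit quants
Q6_K_BLOCK_SIZE = 210  # 128 + 64 bytes of packed quants + 16 scale bytes + 1 x f16 (d)

def byteswap_q4_k_block(block: np.ndarray) -> None:
    # Only the two leading f16 values are multi-byte; the rest are single bytes.
    assert block.dtype == np.uint8 and block.size == Q4_K_BLOCK_SIZE
    block[0:4].view(np.uint16).byteswap(inplace=True)

def byteswap_q6_k_block(block: np.ndarray) -> None:
    # Only the trailing f16 value is multi-byte.
    assert block.dtype == np.uint8 and block.size == Q6_K_BLOCK_SIZE
    block[208:210].view(np.uint16).byteswap(inplace=True)

# Example: an f16 1.0 (0x3C00) stored little-endian comes out big-endian after the swap.
blk = np.zeros(Q4_K_BLOCK_SIZE, dtype=np.uint8)
blk[0:2] = (0x00, 0x3C)   # little-endian bytes of 0x3C00
byteswap_q4_k_block(blk)
print(blk[0:2])           # [60  0], i.e. bytes 0x3C 0x00 in big-endian order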