gguf-py : fix and simplify quantized shape round-trip (#7483)

author compilade <redacted>

Sat, 25 May 2024 01:11:48 +0000 (21:11 -0400)

committer GitHub <redacted>

Sat, 25 May 2024 01:11:48 +0000 (11:11 +1000)
author compilade <redacted>
Sat, 25 May 2024 01:11:48 +0000 (21:11 -0400)
committer GitHub <redacted>
Sat, 25 May 2024 01:11:48 +0000 (11:11 +1000)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py

index 998877c26da1966b11ec3446178e64b20c47b575..51549ac72f8e71d136555a47bd2a9103fdcf27fa 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -313,11 +313,10 @@ class Model:
                          data = data.astype(np.float32)
                      data_qtype = gguf.GGMLQuantizationType.F32
  
-                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
                  # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"""{{{', '.join(str(n) for n in reversed(
-                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
-                )}}}"""
+                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
  
                  # n_dims is implicit in the shape
                  logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py

index 21b089f8a29371edad360c9209139c8c3d54c02d..e48bc00c388c8078a43352a2a54fe4a68eb6fe47 100644 (file)
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
  import numpy as np
  import numpy.typing as npt
  
+from .quants import quant_shape_to_byte_shape
+
  if __name__ == "__main__":
      import sys
      from pathlib import Path
@@ -251,6 +253,7 @@ class GGUFReader:
              tensor_names.add(tensor_name)
              ggml_type = GGMLQuantizationType(raw_dtype[0])
              n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
              block_size, type_size = GGML_QUANT_SIZES[ggml_type]
              n_bytes = n_elems * type_size // block_size
              data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
              else:
                  item_count = n_bytes
                  item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
              tensors.append(ReaderTensor(
                  name = tensor_name,
                  tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
                  n_elements = n_elems,
                  n_bytes = n_bytes,
                  data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                  field = field,
              ))
          self.tensors = tensors
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py

index 8b41b54eaa5a67654875ff1099088c14dbd31ac1..c194dd5dd1e65be6eaeaa372fd7075852ea70f37 100644 (file)
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -13,7 +13,6 @@ from string import ascii_letters, digits
  import numpy as np
  
  from .constants import (
-    GGML_QUANT_SIZES,
      GGUF_DEFAULT_ALIGNMENT,
      GGUF_MAGIC,
      GGUF_VERSION,
@@ -26,6 +25,8 @@ from .constants import (
      TokenType,
  )
  
+from .quants import quant_shape_from_byte_shape
+
  logger = logging.getLogger(__name__)
  
  
@@ -229,10 +230,7 @@ class GGUFWriter:
          else:
              dtype = raw_dtype
              if tensor_dtype == np.uint8:
-                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
-                if tensor_shape[-1] % type_size != 0:
-                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
-                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
          n_dims = len(tensor_shape)
          self.ti_data += self._pack("I", n_dims)
          for i in range(n_dims):
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py

index e7fc0eae3f64bb27ef5f6575bca719c704f08130..b22eec1661ce70461b5cc435ab6c896ce292c699 100644 (file)
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -1,5 +1,5 @@
  from __future__ import annotations
-from typing import Callable
+from typing import Callable, Sequence
  
  from numpy.typing import DTypeLike
  
@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
  import numpy as np
  
  
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
  # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
  def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
      n = n.astype(np.float32, copy=False).view(np.int32)
diff --git a/gguf-py/scripts/gguf-new-metadata.py b/gguf-py/scripts/gguf-new-metadata.py

index 63d3c5d8fdcf452d5125bab377f046189fd20d28..c9f1927f6a0be0f2dce881c1f25f81c822bd94a0 100755 (executable)
--- a/gguf-py/scripts/gguf-new-metadata.py
+++ b/gguf-py/scripts/gguf-new-metadata.py
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
  
      for tensor in reader.tensors:
          total_bytes += tensor.n_bytes
-        # Dimensions are written in reverse order, so flip them first
-        shape = np.flipud(tensor.shape).tolist()
-        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
+        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
  
      bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
author	compilade <redacted>
	Sat, 25 May 2024 01:11:48 +0000 (21:11 -0400)
committer	GitHub <redacted>
	Sat, 25 May 2024 01:11:48 +0000 (11:11 +1000)
convert-hf-to-gguf.py		patch | blob | history
gguf-py/gguf/gguf_reader.py		patch | blob | history
gguf-py/gguf/gguf_writer.py		patch | blob | history
gguf-py/gguf/quants.py		patch | blob | history
gguf-py/scripts/gguf-new-metadata.py		patch | blob | history