gguf : support big endian platform (#3552)

author Qin Yue Chen <redacted>
Fri, 20 Oct 2023 11:19:40 +0000 (06:19 -0500)
committer GitHub <redacted>
Fri, 20 Oct 2023 11:19:40 +0000 (14:19 +0300)
diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py

index 513a7516a25f08b8830e3124608ee1748a3bd283..a1783f71fb668970772f945d68b8f95e9df6cc23 100755 (executable)
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
          "ftype", type=int, choices=[0, 1], default=1, nargs='?',
          help="output format - use 0 for float32, 1 for float16",
      )
+    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
      return parser.parse_args()
  
  args = parse_args()
@@ -86,6 +87,11 @@ if not dir_model.is_dir():
      print(f'Error: {args.model} is not a directory', file = sys.stderr)
      sys.exit(1)
  
+endianess = gguf.GGUFEndian.LITTLE
+if args.bigendian:
+    endianess = gguf.GGUFEndian.BIG
+endianess_str = "Big Endian" if args.bigendian else "Little Endian"
+print(f"gguf: Conversion Endianess {endianess}")
  # possible tensor data types
  #   ftype == 0 -> float32
  #   ftype == 1 -> float16
@@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
  num_parts = count_model_parts(dir_model)
  print(f"num_parts:{num_parts}\n")
  ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
  
  print("gguf: get model metadata")
  
diff --git a/convert.py b/convert.py

index e9b08d344f5bd5ff182341cf74b3f486afc8257d..24da25efcaca13756c36fd8a5bce6301210092a1 100755 (executable)
--- a/convert.py
+++ b/convert.py
@@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
  
  
  class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
  
      def add_meta_arch(self, params: Params) -> None:
          name = "LLaMA"
@@ -875,10 +875,10 @@ class OutputFile:
          self.gguf.close()
  
      @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
          check_vocab_size(params, vocab)
  
-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)
  
          # meta data
          of.add_meta_arch(params)
@@ -903,10 +903,10 @@ class OutputFile:
          return dt.quantize(arr)
  
      @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
          check_vocab_size(params, vocab)
  
-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)
  
          # meta data
          of.add_meta_arch(params)
@@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
      parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
      parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
      parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
-    args = parser.parse_args(args_in)
+    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
  
+    args = parser.parse_args(args_in)
      if args.dump_single:
          model_plus = lazy_load_file(args.model)
          do_dump_model(model_plus)
@@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
      if args.dump:
          do_dump_model(model_plus)
          return
+    endianess = gguf.GGUFEndian.LITTLE
+    if args.bigendian:
+        endianess = gguf.GGUFEndian.BIG
  
      params = Params.load(model_plus)
      if params.n_ctx == -1:
@@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None:
      params.ftype = ftype
      print(f"Writing {outfile}, format {ftype}")
  
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
      print(f"Wrote {outfile}")
  
  
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

index c291f0adf20e18bbb09232b0388dd16f1574ce90..cae3bf3c3dc653ae8f6f703a8b8d9c60401c5805 100644 (file)
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
      if (file.size < 4) {
          return false;
      }
-    uint32_t magic = file.read_u32();
+    std::string magic = file.read_string(4);
      return magic == GGUF_MAGIC;
  }
  
diff --git a/ggml.c b/ggml.c

index ed157aab0993022eb8fc416d58ff9160f5a5ea46..49f3b7aba31f5651719b4da4dddf43b1ab916efd 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -20845,7 +20845,7 @@ struct gguf_kv {
  };
  
  struct gguf_header {
-    uint32_t magic;
+    char magic[4];
      uint32_t version;
      uint64_t n_tensors; // GGUFv2
      uint64_t n_kv;      // GGUFv2
@@ -20915,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
  struct gguf_context * gguf_init_empty(void) {
      struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
  
-    ctx->header.magic     = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
      ctx->header.version   = GGUF_VERSION;
      ctx->header.n_tensors = 0;
      ctx->header.n_kv      = 0;
@@ -20941,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      // offset from start of file
      size_t offset = 0;
  
-    uint32_t magic = 0;
+    char magic[4];
  
      // check the magic before making allocations
      {
          gguf_fread_el(file, &magic, sizeof(magic), &offset);
  
-        if (magic != GGUF_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
-            fclose(file);
-            return NULL;
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fclose(file);
+                return NULL;
+            }
          }
      }
  
@@ -20960,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  
      // read the header
      {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
+
  
          ctx->kv    = NULL;
          ctx->infos = NULL;
diff --git a/ggml.h b/ggml.h

index 6e35888e9cf8639713dfdb45546dca854f8f0f6b..16aaf169ee8fda32c34ad1aed9b5834dc26daf1b 100644 (file)
--- a/ggml.h
+++ b/ggml.h
@@ -231,8 +231,9 @@
  #define GGML_EXIT_SUCCESS 0
  #define GGML_EXIT_ABORTED 1
  
-#define GGUF_MAGIC   0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3
  
  #define GGUF_DEFAULT_ALIGNMENT 32
  
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py

index 557ce7ac0173c036f7b6e83ef0e4ae63f8379b4f..072c839c401d5ac80f3161f70ccaafd1f86c2b18 100644 (file)
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -19,9 +19,10 @@ import numpy as np
  #
  
  GGUF_MAGIC             = 0x46554747
-GGUF_VERSION           = 2
+GGUF_VERSION           = 3
  GGUF_DEFAULT_ALIGNMENT = 32
  
+
  # general
  KEY_GENERAL_ARCHITECTURE         = "general.architecture"
  KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
      Q6_K = 14
      Q8_K = 15
  
+class GGUFEndian(IntEnum):
+    LITTLE = 0
+    BIG = 1
+
  
  class GGUFValueType(IntEnum):
      UINT8   = 0
@@ -644,18 +649,41 @@ class GGUFWriter:
      temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
      tensors: list[tuple[np.ndarray[Any, Any], int]]
  
-    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+    @property
+    def pack_prefix(self):
+        if self.endianess==GGUFEndian.LITTLE:
+            return "<"
+        else:
+            return ">"
+
+    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
          self.fout = open(path, "wb")
          self.arch = arch
+        self.endianess = endianess
+        self._simple_value_packing = {
+            GGUFValueType.UINT8:   f"{self.pack_prefix}B",
+            GGUFValueType.INT8:    f"{self.pack_prefix}b",
+            GGUFValueType.UINT16:  f"{self.pack_prefix}H",
+            GGUFValueType.INT16:   f"{self.pack_prefix}h",
+            GGUFValueType.UINT32:  f"{self.pack_prefix}I",
+            GGUFValueType.INT32:   f"{self.pack_prefix}i",
+            GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
+            GGUFValueType.UINT64:  f"{self.pack_prefix}Q",
+            GGUFValueType.INT64:   f"{self.pack_prefix}q",
+            GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
+            GGUFValueType.BOOL:    "?" ,
+        }
          self.add_architecture()
          self.use_temp_file = use_temp_file
          self.tensors = []
+        endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
+        print(f"This gguf file is for {endianess_str} only")
  
      def write_header_to_file(self):
          self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", GGUF_VERSION))
-        self.fout.write(struct.pack("<Q", self.ti_data_count))
-        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
          self.flush()
  #        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
  
@@ -727,25 +755,12 @@ class GGUFWriter:
          self.add_key(key)
          self.add_val(val, GGUFValueType.ARRAY)
  
-    _simple_value_packing = {
-        GGUFValueType.UINT8:   "<B",
-        GGUFValueType.INT8:    "<b",
-        GGUFValueType.UINT16:  "<H",
-        GGUFValueType.INT16:   "<h",
-        GGUFValueType.UINT32:  "<I",
-        GGUFValueType.INT32:   "<i",
-        GGUFValueType.FLOAT32: "<f",
-        GGUFValueType.UINT64:  "<Q",
-        GGUFValueType.INT64:   "<q",
-        GGUFValueType.FLOAT64: "<d",
-        GGUFValueType.BOOL:    "?" ,
-    }
      def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
          if vtype is None:
              vtype = GGUFValueType.get_type(val)
  
          if add_vtype:
-            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
              self.kv_data_count += 1
  
          pack_fmt = self._simple_value_packing.get(vtype)
@@ -753,14 +768,14 @@ class GGUFWriter:
              self.kv_data += struct.pack(pack_fmt, val)
          elif vtype == GGUFValueType.STRING:
              encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
              self.kv_data += encoded_val
          elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
              ltype = GGUFValueType.get_type(val[0])
              if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                  raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack("<I", ltype)
-            self.kv_data += struct.pack("<Q", len(val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
              for item in val:
                  self.add_val(item, add_vtype=False)
          else:
@@ -774,22 +789,24 @@ class GGUFWriter:
          assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
  
          encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
          self.ti_data += encoded_name
          n_dims = len(tensor_shape)
-        self.ti_data += struct.pack("<I", n_dims)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
          for i in range(n_dims):
-            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
          if raw_dtype is None:
              dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
          else:
              dtype = raw_dtype
-        self.ti_data += struct.pack("<I", dtype)
-        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
          self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
          self.ti_data_count += 1
  
      def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+        if self.endianess == GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
          if self.use_temp_file and self.temp_file is None:
              fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
              fp.seek(0)
@@ -815,6 +832,8 @@ class GGUFWriter:
              fp.write(bytes([0] * pad))
  
      def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        if self.endianess==GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
          self.write_padding(self.fout, self.fout.tell())
          tensor.tofile(self.fout)
          self.write_padding(self.fout, tensor.nbytes)
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml

index 07a7ab4dd11fc6f108ce0c77f91365210df9d507..f0741a7c23e034061470dd87dcb7e40c43310ff4 100644 (file)
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "gguf"
-version = "0.4.4"
+version = "0.4.5"
  description = "Write ML models in GGUF for GGML"
  authors = ["GGML <ggml@ggml.ai>"]
  packages = [
diff --git a/k_quants.c b/k_quants.c

index e168a87bb25ed112bedce0d82d125585c662dc39..801941fbee075a845e1bc1c5da186949fe78137f 100644 (file)
--- a/k_quants.c
+++ b/k_quants.c
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
  #include <immintrin.h>
  #endif
  #endif
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp

index b506f273fee9fcff953743928f9a193991930144..afd7bf77fcb552472e594d4bce4b72b278a3f8a9 100644 (file)
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,9 @@
  
  #undef NDEBUG
  #include <cassert>
+#if !defined(__riscv) && !defined(__s390__)
  #include <immintrin.h>
+#endif
  #include <cmath>
  #include <cstdint>
  #include <cstring>
author	Qin Yue Chen <redacted>
	Fri, 20 Oct 2023 11:19:40 +0000 (06:19 -0500)
committer	GitHub <redacted>
	Fri, 20 Oct 2023 11:19:40 +0000 (14:19 +0300)
convert-baichuan-hf-to-gguf.py	patch | blob | history
convert.py	patch | blob | history
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp	patch | blob | history
ggml.c	patch | blob | history
ggml.h	patch | blob | history
gguf-py/gguf/gguf.py	patch | blob | history
gguf-py/pyproject.toml	patch | blob | history
k_quants.c	patch | blob | history
tests/test-double-float.cpp	patch | blob | history