hooks : setting up flake8 and pre-commit hooks (#1681)

author Jiří Podivín <redacted>

Sat, 17 Jun 2023 10:32:48 +0000 (12:32 +0200)

committer GitHub <redacted>

Sat, 17 Jun 2023 10:32:48 +0000 (13:32 +0300)
author Jiří Podivín <redacted>
Sat, 17 Jun 2023 10:32:48 +0000 (12:32 +0200)
committer GitHub <redacted>
Sat, 17 Jun 2023 10:32:48 +0000 (13:32 +0300)
diff --git a/.flake8 b/.flake8

new file mode 100644 (file)

index 0000000..113ca5f
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 125
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml

new file mode 100644 (file)

index 0000000..65796fe
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,15 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v3.2.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-added-large-files
+- repo: https://github.com/PyCQA/flake8
+  rev: 6.0.0
+  hooks:
+  -   id: flake8
diff --git a/convert.py b/convert.py

index ece5a0266836529444a845ecfd2ef64413bd4a7e..265c41fa04b189669059773abc76f9bd347d2116 100644 (file)
--- a/convert.py
+++ b/convert.py
@@ -512,7 +512,11 @@ class LazyTensor:
              if not isinstance(self.data_type, QuantizedDataType):
                  raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
              if self.data_type.have_g_idx:
-                sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML.  For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+                sys.stderr.write(
+                    "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                    "which is not yet natively supported by GGML. "
+                    "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                    "but that will result in a much larger output file for no quality benefit.\n")
                  sys.exit(1)
              assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
  
@@ -694,8 +698,9 @@ class LazyUnpickler(pickle.Unpickler):
          description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
          return LazyStorage(load=load, kind=pid[1], description=description)
  
-   # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                                 requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
          assert isinstance(storage, LazyStorage)
  
@@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
      # Use mmap for the actual data to avoid race conditions with the file offset.
      off = fp.raw.tell()
      mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off) # needed on Windows
+    fp.raw.seek(off)  # needed on Windows
  
      def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
          shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
          files = list(path.glob("model-00001-of-*.safetensors"))
          if not files:
              # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
              files = [file for glob in globs for file in path.glob(glob)]
          if not files:
              # Try GGML too, but with lower priority, since if both a non-GGML
@@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
          elif path3.exists():
              path = path3
          else:
-            raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+            raise FileNotFoundError(
+                f"Could not find tokenizer.model in {path} or its parent; "
+                "if it's in another directory, pass the directory as --vocab-dir")
      added_tokens_path = path.parent / "added_tokens.json"
      print(f"Loading vocab file {path}")
      return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
      }[params.file_type]
      ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
      if ret in model_paths:
-        sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input.  Please explicitly specify a path using --outfile.\n")
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
          sys.exit(1)
      return ret
  
@@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
      parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
      parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
      parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("model", type=Path,
+                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
      args = parser.parse_args(args_in)
  
      vocab: Vocab
diff --git a/examples/jeopardy/graph.py b/examples/jeopardy/graph.py

index d00b2865263bb17eced5ac1084c1411e607b9196..1b6c54bff73d13096140d1de2cd46cdaefc071d5 100644 (file)
--- a/examples/jeopardy/graph.py
+++ b/examples/jeopardy/graph.py
@@ -1,5 +1,5 @@
  import matplotlib.pyplot as plt
-import sys, os
+import os
  import csv
  
  labels = []
@@ -8,6 +8,7 @@ numEntries = 1
  
  rows = []
  
+
  def bar_chart(numbers, labels, pos):
      plt.bar(pos, numbers, color='blue')
      plt.xticks(ticks=pos, labels=labels)
@@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
      plt.ylabel("Questions Correct")
      plt.show()
  
+
  def calculatecorrect():
      directory = os.fsencode("./examples/jeopardy/results/")
      csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
@@ -38,14 +40,13 @@ def calculatecorrect():
                      print(line)
                  else:
                      print("Correct answer: " + rows[i][2] + "\n")
-                    i+=1
+                    i += 1
                      print("Did the AI get the question right? (y/n)")
                      if input() == "y":
                          totalcorrect += 1
              numbers.append(totalcorrect)
  
  
-
  if __name__ == '__main__':
      calculatecorrect()
      pos = list(range(numEntries))
diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py

index 2ce57282607d8f5908b6ab0008dea7f5e44287cd..d127482819f466984535047695f3bf795157061c 100644 (file)
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -1,9 +1,10 @@
  import os
  import hashlib
  
+
  def sha256sum(file):
      block_size = 16 * 1024 * 1024  # 16 MB block size
-    b  = bytearray(block_size)
+    b = bytearray(block_size)
      file_hash = hashlib.sha256()
      mv = memoryview(b)
      with open(file, 'rb', buffering=0) as f:
@@ -15,6 +16,7 @@ def sha256sum(file):
  
      return file_hash.hexdigest()
  
+
  # Define the path to the llama directory (parent folder of script directory)
  llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
author	Jiří Podivín <redacted>
	Sat, 17 Jun 2023 10:32:48 +0000 (12:32 +0200)
committer	GitHub <redacted>
	Sat, 17 Jun 2023 10:32:48 +0000 (13:32 +0300)
.flake8	[new file with mode: 0644]	patch | blob
.pre-commit-config.yaml	[new file with mode: 0644]	patch | blob
convert.py		patch | blob | history
examples/jeopardy/graph.py		patch | blob | history
scripts/verify-checksum-models.py		patch | blob | history