* Superfluous parens in conditionals were removed.
* Unused arguments in function signatures were removed.
* Replaced unused `idx` var with `_`
* Initializing `file_format` and `format_version` attributes
* Renaming constant to capitals
* Preventing redefinition of the `f` var (all of the above are sketched below)
Signed-off-by: Jiri Podivin <redacted>
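A minimal sketch of the patterns listed above; the class, constant, and file here are hypothetical and only illustrate the style rules, they are not taken from the conversion scripts:

```python
# Hypothetical module-level constant, named in capitals.
VOCAB_SIZE = 32000

class ExampleModel:
    # Attributes declared on the class so they exist for type checkers
    # before __init__ assigns them.
    file_format: str
    format_version: int

    def __init__(self):  # no unused *args / **kwargs in the signature
        self.file_format = ''
        self.format_version = 0

    def load_tokens(self, path: str) -> list:
        tokens = []
        # A descriptive handle name avoids redefining `f` in nested scopes.
        with open(path, 'r', encoding='utf-8') as token_file:
            for _, line in enumerate(token_file):  # `_` replaces the unused index
                if len(tokens) >= VOCAB_SIZE:  # no superfluous parentheses
                    break
                tokens.append(line.strip())
        return tokens
```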
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
- if (rope_scaling is None):
+ if rope_scaling is None:
return
scale = max_pos_embds / orig_max_pos_embds
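For illustration, the scale factor is just the ratio of the extended context length to the original one; the numbers below are assumed, not read from any particular model config:

```python
# Hypothetical values: a model whose context was extended from 4k to 128k.
orig_max_pos_embds = 4096
max_pos_embds = 131072

scale = max_pos_embds / orig_max_pos_embds  # 131072 / 4096 = 32.0
```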
yield name, data
- def set_vocab(self, *args, **kwargs):
+ def set_vocab(self):
tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class']
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
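The same out-of-range bounds check appears in several of these loops. A self-contained sketch of the pattern, with a helper name and return shape chosen only for illustration:

```python
import json
import logging

logger = logging.getLogger(__name__)

def load_added_tokens(path: str, vocab_size: int) -> dict:
    """Return added tokens whose ids fit inside the base vocabulary."""
    with open(path, 'r', encoding='utf-8') as added_tokens_file:
        added_tokens_json = json.load(added_tokens_file)

    valid = {}
    for key, token_id in added_tokens_json.items():
        if token_id >= vocab_size:
            logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
            continue
        valid[key] = token_id
    return valid
```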
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
- with open(save_path, 'wb') as f:
- f.write(response.content)
+ with open(save_path, 'wb') as downloaded_file:
+ downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
- chktok = tokenizer.encode(chktxt)
+ chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
- chktxt = {repr(chktxt)}
+ chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
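Roughly how the hash gets consumed later: the checksum of the encoded check string is compared against known values to pick a pre-tokenizer name. The function name, mapping, and placeholder hash below are assumptions for illustration only:

```python
from hashlib import sha256

def get_pre_tokenizer_name(tokenizer, known_hashes: dict):
    # CHK_TXT is the module-level check string defined above.
    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    # known_hashes maps a checksum to a pre-tokenizer name, e.g.
    # {"<placeholder sha256>": "llama-bpe"}; an unknown hash returns None
    # so the caller can fail loudly and ask for the tokenizer to be registered.
    return known_hashes.get(chkhsh)
```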
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
- chktxt,
+ CHK_TXT,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
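A hedged sketch of that step, assuming `name`, `tests`, and `tokenizer` are in scope; the `__ggml_vocab_test__` separator and the matching `.out` file are assumptions about the on-disk format, shown only to make the step concrete:

```python
def write_vocab_tests(name: str, tests: list, tokenizer) -> None:
    # Each test string goes to the .inp file, followed by a separator line.
    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as inp_file:
        for text in tests:
            inp_file.write(f"{text}")
            inp_file.write("\n__ggml_vocab_test__\n")
    # The expected token ids go to a matching .out file, one line per test.
    with open(f"models/ggml-vocab-{name}.gguf.out", "w", encoding="utf-8") as out_file:
        for text in tests:
            for token in tokenizer.encode(text, add_special_tokens=False):
                out_file.write(f" {token}")
            out_file.write("\n")
```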
class GGMLModel:
+
+ file_format: GGMLFormat
+ format_version: int
+
def __init__(self):
self.hyperparameters = None
self.vocab = None
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
- for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+ for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
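Since the index is now discarded, a further simplification (not applied in this change) would be to drop `enumerate()` altogether:

```python
for vbytes, score, ttype in vo.all_tokens():
    tokens.append(vbytes)
    scores.append(score)
    toktypes.append(ttype)
```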