convert : skip unaccessible HF repos (#7210)

author CrispStrobe <redacted>

Sat, 11 May 2024 08:18:35 +0000 (10:18 +0200)

committer GitHub <redacted>

Sat, 11 May 2024 08:18:35 +0000 (11:18 +0300)
author CrispStrobe <redacted>
Sat, 11 May 2024 08:18:35 +0000 (10:18 +0200)
committer GitHub <redacted>
Sat, 11 May 2024 08:18:35 +0000 (11:18 +0300)
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py

index e757d5ccbc0b4f86dd09adaf93d91ac46320a9bc..cd2674a0ea97d5d71be2f7abeea28ba14644eebb 100755 (executable)
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -145,8 +145,17 @@ for model in models:
      if tokt == TOKENIZER_TYPE.SPM:
          continue
  
+    # Skip if the tokenizer folder does not exist (e.g. because an earlier download attempt failed)
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
      # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+        continue  # Skip to the next model if the tokenizer can't be loaded
  
      chktok = tokenizer.encode(chktxt)
      chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -287,8 +296,17 @@ for model in models:
      name = model["name"]
      tokt = model["tokt"]
  
+    # Skip if the tokenizer folder does not exist (e.g. because an earlier download attempt failed)
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
      # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
+        continue  # Skip this model and continue with the next one in the loop
  
      with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
          for text in tests:
author	CrispStrobe <redacted>
	Sat, 11 May 2024 08:18:35 +0000 (10:18 +0200)
committer	GitHub <redacted>
	Sat, 11 May 2024 08:18:35 +0000 (11:18 +0300)