--- /dev/null
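+# Verifies that the pre-tokenizer hashes hard-coded in convert_hf_to_gguf.py
+# stay in sync with the ones generated by convert_hf_to_gguf_update.py.
+# To reproduce the check locally (from the repository root):
+#   python3 -m venv .venv
+#   .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+#   cp convert_hf_to_gguf.py /tmp
+#   .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+#   diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py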
+name: Check Pre-Tokenizer Hashes
+
+on:
+  push:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+  pull_request:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+
+jobs:
+  pre-tokenizer-hashes:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+      - name: Update pre-tokenizer hashes
+        run: |
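+          # keep a copy of the committed script: the update script rewrites convert_hf_to_gguf.py in place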
+          cp convert_hf_to_gguf.py /tmp
+          .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+      - name: Check if committed pre-tokenizer hashes match the generated version
+        run: |
+          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes."
+            echo "Differences found:"
+            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+            exit 1
+          fi
+          echo "Model pre-tokenizer hashes are up to date."
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
+parser.add_argument(
+ "--check-missing", action="store_true",
+ help="only check for missing pre-tokenizer hashes",
+)
parser.add_argument(
"hf_token",
help="optional HF token",
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
+if args.check_missing and args.full:
+ logger.warning("Downloading full list of models requested, ignoring --check-missing!")
+ args.check_missing = False
+
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]
-logging.info(f"Downloading {len(models)} models...")
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+if not args.check_missing:
+    logger.info(f"Downloading {len(models)} models...")
+    for model in models:
+        try:
+            download_model(model)
+        except Exception as e:
+            logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

--- a/requirements/requirements-convert_hf_to_gguf_update.txt
+++ b/requirements/requirements-convert_hf_to_gguf_update.txt
-r ./requirements-convert_legacy_llama.txt
---extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1; platform_machine != "s390x"
-
-# torch s390x packages can only be found from nightly builds
---extra-index-url https://download.pytorch.org/whl/nightly
-torch>=0.0.0.dev0; platform_machine == "s390x"
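+# NOTE: torch is intentionally no longer pinned here; the hash update script is
+# assumed to need only the tokenizer dependencies pulled in by the include above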