ci : check that pre-tokenizer hashes are up-to-date (#15032)
author     Sigbjørn Skjæret <redacted>
           Sat, 2 Aug 2025 12:39:01 +0000 (14:39 +0200)
committer  GitHub <redacted>
           Sat, 2 Aug 2025 12:39:01 +0000 (14:39 +0200)
* torch is not required for convert_hf_to_gguf_update

* add --check-missing parameter

* check that pre-tokenizer hashes are up-to-date
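
With --check-missing, the hash verification can run without downloading any tokenizers. The same check the workflow below performs can be run locally:

    python3 convert_hf_to_gguf_update.py --check-missing

This only reports models whose pre-tokenizer hashes are missing from convert_hf_to_gguf.py; the model downloads that --full performs are skipped.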

.github/workflows/pre-tokenizer-hashes.yml [new file with mode: 0644]
convert_hf_to_gguf_update.py
requirements/requirements-convert_hf_to_gguf_update.txt

diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
new file mode 100644 (file)
index 0000000..dff998e
--- /dev/null
@@ -0,0 +1,45 @@
+name: Check Pre-Tokenizer Hashes
+
+on:
+    push:
+        paths:
+            - 'convert_hf_to_gguf.py'
+            - 'convert_hf_to_gguf_update.py'
+    pull_request:
+        paths:
+            - 'convert_hf_to_gguf.py'
+            - 'convert_hf_to_gguf_update.py'
+
+jobs:
+    pre-tokenizer-hashes:
+        runs-on: ubuntu-latest
+
+        steps:
+        - name: Checkout repository
+          uses: actions/checkout@v4
+
+        - name: Set up Python
+          uses: actions/setup-python@v5
+          with:
+              python-version: '3.11'
+
+        - name: Install Python dependencies
+          run: |
+              python3 -m venv .venv
+              .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+        - name: Update pre-tokenizer hashes
+          run: |
+              cp convert_hf_to_gguf.py /tmp
+              .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+        - name: Check that committed pre-tokenizer hashes match the generated version
+          run: |
+              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+                  echo "Differences found:"
+                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+                  exit 1
+              fi
+              echo "Model pre-tokenizer hashes are up to date."
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 5e21c1f47f18c4938756957ca5e3dc9b6677f771..211b81ff34088c7ba154c10a36111ec9588ff20e 100755 (executable)
@@ -59,6 +59,10 @@ parser.add_argument(
     "--full", action="store_true",
     help="download full list of models - make sure you have access to all of them",
 )
+parser.add_argument(
+    "--check-missing", action="store_true",
+    help="only check for missing pre-tokenizer hashes",
+)
 parser.add_argument(
     "hf_token",
     help="optional HF token",
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
 if hf_token is None:
     logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 
+if args.check_missing and args.full:
+    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
+    args.check_missing = False
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
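
The hashes under test are derived from CHK_TXT above. A minimal sketch of the derivation, assuming the sha256-over-token-ids scheme used by the update script (the exact code lives in convert_hf_to_gguf_update.py):

    # Sketch: fingerprint a pre-tokenizer by hashing the token ids it
    # produces for the fixed check string. Assumes transformers is
    # installed and model_dir holds a downloaded tokenizer.
    from hashlib import sha256
    from transformers import AutoTokenizer

    def pre_tokenizer_hash(model_dir: str, chk_txt: str) -> str:
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        ids = tokenizer.encode(chk_txt)  # token ids for the check string
        return sha256(str(ids).encode()).hexdigest()

Two pre-tokenizers that split CHK_TXT differently yield different id sequences and therefore different hashes, which is what makes the string a useful fingerprint.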
@@ -222,12 +230,13 @@ if not args.full:
     all_models = models.copy()
     models = [model for model in all_models if model["name"] not in existing_models]
 
-logging.info(f"Downloading {len(models)} models...")
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+if not args.check_missing:
+    logging.info(f"Downloading {len(models)} models...")
+    for model in models:
+        try:
+            download_model(model)
+        except Exception as e:
+            logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
 
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
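
The regenerated section is a chain of hash comparisons inside get_vocab_base_pre(). Its shape is roughly the following (hash value and model reference are placeholders, not taken from the repository):

    # Illustrative shape of the generated code; values are placeholders.
    if chkhsh == "0000000000000000000000000000000000000000000000000000000000000000":
        # ref: https://huggingface.co/placeholder-org/placeholder-model
        res = "llama-bpe"

The CI job fails whenever this generated section drifts from what is committed in convert_hf_to_gguf.py.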
diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt
index 431c596c12354db591694aa618fb81d769444c3f..afe2747d448d4d896c78c52ec5f40560f09b09f9 100644 (file)
@@ -1,7 +1 @@
 -r ./requirements-convert_legacy_llama.txt
---extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1; platform_machine != "s390x"
-
-# torch s390x packages can only be found from nightly builds
---extra-index-url https://download.pytorch.org/whl/nightly
-torch>=0.0.0.dev0; platform_machine == "s390x"
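
The file shrinks to a single include because the update script only asks tokenizers for token ids and never builds tensors, so torch (and the s390x nightly-build workaround) is no longer needed. A minimal sketch of the torch-free usage, assuming transformers comes in via requirements-convert_legacy_llama.txt and the path below is a hypothetical local tokenizer directory:

    # Sketch: tokenization without torch. transformers needs torch only
    # when tensors are requested (e.g. return_tensors="pt"); plain
    # encode() returns a list of ints.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("models/tokenizers/llama-bpe")  # hypothetical path
    print(tok.encode("hello world"))  # -> list[int], no torch involved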