bert : add tests + fix quantization (#5475)
author    Georgi Gerganov <redacted>
Tue, 13 Feb 2024 11:01:29 +0000 (13:01 +0200)
committer GitHub <redacted>
Tue, 13 Feb 2024 11:01:29 +0000 (13:01 +0200)
* llama : do not quantize pos embd and token type tensors

* ci : add BERT tests

ggml-ci

* ci : do not do BERT tests on low-perf nodes

ggml-ci
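
To exercise the new test locally, the CI script is invoked the same way as for the other targets; a minimal sketch (the two directories are arbitrary: the first collects logs and results, the second is the model download mount that the new gg_wget calls populate on first run):

    # run the CI locally; per-target logs land in ./tmp/results
    mkdir -p tmp
    bash ./ci/run.sh ./tmp/results ./tmp/mnt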

ci/run.sh
llama.cpp

index 82fe247a557460432d420e7ee163590bad6612da..a4264d7754c4bbccab98352e698aaece62dedaf1 100755 (executable)
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
     #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
             test $ret -eq 0 && gg_run open_llama_3b_v2
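
As the hunk above shows, the new embd_bge_small target sits inside the same GG_BUILD_LOW_PERF guard as the open_llama runs, so low-perf nodes skip it entirely. A hedged sketch of the two invocations, using the environment variables visible in the script:

    # low-perf node: BERT and open_llama targets are skipped
    GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

    # regular node: embd_bge_small runs before the open_llama targets,
    # which are further gated on GG_BUILD_VRAM_GB / GG_BUILD_CUDA
    bash ./ci/run.sh ./tmp/results ./tmp/mnt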
index d316d067b9d580d5fe125e164e03599d93dee3df..6dce392df005347398a74fd73a3390da3726e9e9 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -10444,7 +10444,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+        // do not quantize positional embeddings and token types (BERT)
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
         enum ggml_type new_type;
         void * new_data;
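
One way to check the quantization fix locally (a hedged sketch, assuming the bge-small files produced by the CI run above and the standard GGUF tensor names "position_embd.weight" and "token_types.weight" for BERT): re-run quantize and inspect the per-tensor log it prints; the position embedding and token type tensors should now be copied at their original type rather than converted to q8_0:

    # paths follow the CI layout above; quantize logs every tensor it touches
    ./build-ci-release/bin/quantize \
        models-mnt/bge-small/ggml-model-f16.gguf \
        models-mnt/bge-small/ggml-model-q8_0.gguf q8_0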