server : check that the prompt fits in the slot's context (#10030)

author Georgi Gerganov <redacted>

Fri, 25 Oct 2024 07:13:46 +0000 (10:13 +0300)

committer GitHub <redacted>

Fri, 25 Oct 2024 07:13:46 +0000 (10:13 +0300)
author Georgi Gerganov <redacted>
Fri, 25 Oct 2024 07:13:46 +0000 (10:13 +0300)
committer GitHub <redacted>
Fri, 25 Oct 2024 07:13:46 +0000 (10:13 +0300)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index 7e552a71b5c7c1ba795c884ab07ac9f45a9b062d..a34dabe235a34caba90eb195990bac06d892e861 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -573,6 +573,9 @@ class Model:
          if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
              # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
              res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
          if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
              # ref: https://huggingface.co/mosaicml/mpt-7b
              res = "mpt"
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py

index 022354a3b624ecdd1fd6da536ebb7f77ec7405d0..28cd02e5a7f663215b75edff37d70198c79e4cc8 100755 (executable)
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -72,6 +72,7 @@ models = [
      {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
      {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
      {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
      {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
      {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
      {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 58f93694f684655b2a811df21e601b68676fbd67..2821877b2a6fb51b15e4662ab920efbe6c25091b 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1882,12 +1882,17 @@ struct server_context {
                          }
  
                          if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) {
-                            // this prompt is too large to process - discard it
                              if (slot.n_prompt_tokens > n_ubatch) {
                                  slot.release();
                                  send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                                  continue;
                              }
+
+                            if (slot.n_prompt_tokens > slot.n_ctx) {
+                                slot.release();
+                                send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_SERVER);
+                                continue;
+                            }
                          } else {
                              if (!params.ctx_shift) {
                                  // if context shift is disabled, we make sure prompt size is smaller than KV size
author	Georgi Gerganov <redacted>
	Fri, 25 Oct 2024 07:13:46 +0000 (10:13 +0300)
committer	GitHub <redacted>
	Fri, 25 Oct 2024 07:13:46 +0000 (10:13 +0300)
convert_hf_to_gguf.py		patch | blob | history
convert_hf_to_gguf_update.py		patch | blob | history
examples/server/server.cpp		patch | blob | history