vocab : prevent tokenizer overflow (#14301)

author Ruikai Peng <redacted>

Fri, 20 Jun 2025 14:13:06 +0000 (22:13 +0800)

committer GitHub <redacted>

Fri, 20 Jun 2025 14:13:06 +0000 (07:13 -0700)
author Ruikai Peng <redacted>
Fri, 20 Jun 2025 14:13:06 +0000 (22:13 +0800)
committer GitHub <redacted>
Fri, 20 Jun 2025 14:13:06 +0000 (07:13 -0700)
diff --git a/common/common.cpp b/common/common.cpp

index c2c94e7ae6c0892fcf8fa0ff66c537605279bee0..e4e71ad13fb59ed4e3fb48a2952d6ae24fc02925 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
      int n_tokens = text.length() + 2 * add_special;
      std::vector<llama_token> result(n_tokens);
      n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
      if (n_tokens < 0) {
          result.resize(-n_tokens);
          int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
diff --git a/include/llama.h b/include/llama.h

index 3475d596502c654ee301cf29d9a7c4eb50d31c47..b04720bee59ef644922638d210632c7621d7edae 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -1088,6 +1088,7 @@ extern "C" {
      /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
      /// @return Returns the number of tokens on success, no more than n_tokens_max
      /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
      /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
      /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
      ///                      as plaintext. Does not insert a leading space.
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index 4ab120d9ba818ffdbc201594f5c8af418dc18a38..4aaf4c8250ce54285ff26a34ae812e4f0dcc23dc 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -3074,6 +3074,11 @@ int32_t llama_vocab::tokenize(
                          bool   add_special,
                          bool   parse_special) const {
      auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
+        return std::numeric_limits<int32_t>::min();
+    }
+    
      if (n_tokens_max < (int) res.size()) {
          // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
          return -((int) res.size());
author	Ruikai Peng <redacted>
	Fri, 20 Jun 2025 14:13:06 +0000 (22:13 +0800)
committer	GitHub <redacted>
	Fri, 20 Jun 2025 14:13:06 +0000 (07:13 -0700)
common/common.cpp		patch | blob | history
include/llama.h		patch | blob | history
src/llama-vocab.cpp		patch | blob | history