From: Sigbjørn Skjæret Date: Thu, 19 Mar 2026 07:34:04 +0000 (+0100) Subject: vocab : assert array size of scores and toktypes (#20737) X-Git-Tag: upstream/0.0.8611~189 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=811397745e161563f118944ae6fab61ebcd1707e;p=pkg%2Fggml%2Fsources%2Fllama.cpp vocab : assert array size of scores and toktypes (#20737) --- diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 68ba292d4..13934339d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2129,19 +2129,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { throw std::runtime_error("cannot find tokenizer vocab in model file\n"); } + const uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx); + const float * scores = nullptr; const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); if (score_idx != -1) { + const uint32_t n_scores = gguf_get_arr_n(ctx, score_idx); + if (n_scores < n_tokens) { + throw std::runtime_error("Index out of array bounds for scores (" + std::to_string(n_scores) + " < " + std::to_string(n_tokens) + ")\n"); + } scores = (const float * ) gguf_get_arr_data(ctx, score_idx); } const int * toktypes = nullptr; const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); if (toktype_idx != -1) { + const uint32_t n_toktypes = gguf_get_arr_n(ctx, toktype_idx); + if (n_toktypes < n_tokens) { + throw std::runtime_error("Index out of array bounds for toktypes (" + std::to_string(n_toktypes) + " < " + std::to_string(n_tokens) + ")\n"); + } toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); } - uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx); id_to_token.resize(n_tokens); for (uint32_t i = 0; i < n_tokens; i++) {