From: Sigbjørn Skjæret Date: Sat, 24 May 2025 10:29:09 +0000 (+0200) Subject: vocab : fix ugm tokenizer precision (#13743) X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c3a2624339187e89c4f65fd72a5fe7103968b5ad;p=pkg%2Fggml%2Fsources%2Fllama.cpp vocab : fix ugm tokenizer precision (#13743) --- diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9389ca80..d5a036a8 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session { } // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores - std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX}); + std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX}); // at the beginning tokenization score is zero tokenization_results[0] = { vocab.token_unk(), 0, 0 }; @@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session { const double challenger_score = current_best.score_sum + token_score; struct best_tokenization & current_champ = tokenization_results[prefix_offset]; if (challenger_score > current_champ.score_sum) { - struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score }; + struct best_tokenization challenger = { token_id, input_offset, challenger_score }; current_champ = challenger; } } @@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session { prefix_offset = input_offset + n_utf8_code_units; struct best_tokenization & current_champ = tokenization_results[prefix_offset]; if (challenger_score > current_champ.score_sum) { - struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score }; + struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score }; current_champ = challenger; } } @@ -1007,7 +1007,7 @@ private: struct best_tokenization { llama_token token_id; size_t input_offset; - float score_sum; + double score_sum; }; struct normalization_result normalize_prefix(const std::string & 
input, size_t input_offset) {