From: Sigbjørn Skjæret <redacted>
Date: Sat, 24 May 2025 10:29:09 +0000 (+0200)
Subject: vocab : fix ugm tokenizer precision (#13743)
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c3a2624339187e89c4f65fd72a5fe7103968b5ad;p=pkg%2Fggml%2Fsources%2Fllama.cpp

vocab : fix ugm tokenizer precision (#13743)
---

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 9389ca80..d5a036a8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
         }
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
         // at the beginning tokenization score is zero
         tokenization_results[0] = { vocab.token_unk(), 0, 0 };
 
@@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -1007,7 +1007,7 @@ private:
     struct best_tokenization {
         llama_token token_id;
         size_t input_offset;
-        float score_sum;
+        double score_sum;
     };
 
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {