vocab : fix ugm tokenizer precision (#13743)

author Sigbjørn Skjæret <redacted>

Sat, 24 May 2025 10:29:09 +0000 (12:29 +0200)

committer GitHub <redacted>

Sat, 24 May 2025 10:29:09 +0000 (12:29 +0200)
author Sigbjørn Skjæret <redacted>
Sat, 24 May 2025 10:29:09 +0000 (12:29 +0200)
committer GitHub <redacted>
Sat, 24 May 2025 10:29:09 +0000 (12:29 +0200)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index 9389ca805a584fadcb9110b0841f95234907b8b7..d5a036a8c4413cb91c6d242ec2f049ced6b3cc62 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
          }
  
          // initialize score_sum to -DBL_MAX so it is always lower than any sum of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
          // at the beginning tokenization score is zero
          tokenization_results[0] = { vocab.token_unk(), 0, 0 };
  
@@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
                      const double challenger_score = current_best.score_sum + token_score;
                      struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                      if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                          current_champ = challenger;
                      }
                  }
@@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
                  prefix_offset = input_offset + n_utf8_code_units;
                  struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                  if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                      current_champ = challenger;
                  }
              }
@@ -1007,7 +1007,7 @@ private:
      struct best_tokenization {
          llama_token token_id;
          size_t input_offset;
-        float score_sum;
+        double score_sum;
      };
  
      struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
author	Sigbjørn Skjæret <redacted>
	Sat, 24 May 2025 10:29:09 +0000 (12:29 +0200)
committer	GitHub <redacted>
	Sat, 24 May 2025 10:29:09 +0000 (12:29 +0200)