whisper : fix UB when reading buffer of length 0 bytes (#265)

author Georgi Gerganov <redacted>
Tue, 13 Dec 2022 21:13:55 +0000 (23:13 +0200)
committer Georgi Gerganov <redacted>
Tue, 13 Dec 2022 21:14:47 +0000 (23:14 +0200)
diff --git a/whisper.cpp b/whisper.cpp

index 33f258bc275821514f347b0b158a31c3cf305419..1bc799676bddf4829cf253f01fed33597de5fdd0 100644 (file)
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -549,13 +549,20 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
          //}
  
          std::string word;
+        std::vector<char> tmp;
          for (int i = 0; i < n_vocab; i++) {
              uint32_t len;
              read_safe(fin, len);
  
-            std::vector<char> tmp(len); // create a buffer
-            fin.read( &tmp[0], tmp.size() ); // read to buffer
-            word.assign(&tmp[0], tmp.size());
+            if (len > 0) {
+                tmp.resize(len);
+                fin.read(&tmp[0], tmp.size()); // read to buffer
+                word.assign(&tmp[0], tmp.size());
+            } else {
+                // seems like we have an empty-string token in multi-language models (i = 50256)
+                //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+                word = "";
+            }
  
              vocab.token_to_id[word] = i;
              vocab.id_to_token[i] = word;
author	Georgi Gerganov <redacted>
	Tue, 13 Dec 2022 21:13:55 +0000 (23:13 +0200)
committer	Georgi Gerganov <redacted>
	Tue, 13 Dec 2022 21:14:47 +0000 (23:14 +0200)