gguf : make token scores and types optional (#3347)

author Cebtenzzre <redacted>

Thu, 28 Sep 2023 18:30:15 +0000 (14:30 -0400)

committer GitHub <redacted>

Thu, 28 Sep 2023 18:30:15 +0000 (14:30 -0400)
author Cebtenzzre <redacted>
Thu, 28 Sep 2023 18:30:15 +0000 (14:30 -0400)
committer GitHub <redacted>
Thu, 28 Sep 2023 18:30:15 +0000 (14:30 -0400)
diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py

index 88338d823b03585275be7c9f36ba9b4c7e32abd5..958358563ccdcfca15691c38c4e04aa1db45c6b8 100755 (executable)
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
  print("gguf: get tokenizer metadata")
  
  tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
  
  tokenizer_json_file = dir_model / 'tokenizer.json'
  if not tokenizer_json_file.is_file():
@@ -177,12 +175,8 @@ for i in range(vocab_size):
          text = bytearray(pad_token)
  
      tokens.append(text)
-    scores.append(0.0)                      # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy
  
  gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
  
  special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
  special_vocab.add_to_gguf(gguf_writer)
diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py

index 331e84e985a2f1fd2e11423101777364962123de..48e88a777fea1db20a1f42633203d326578de16e 100755 (executable)
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
  print("gguf: get tokenizer metadata")
  
  tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
  
  tokenizer_json_file = dir_model / 'tokenizer.json'
  if not tokenizer_json_file.is_file():
@@ -161,12 +159,8 @@ for i in range(vocab_size):
          text = bytearray(pad_token)
  
      tokens.append(text)
-    scores.append(0.0)                      # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy
  
  gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
  
  special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
  special_vocab.add_to_gguf(gguf_writer)
diff --git a/llama.cpp b/llama.cpp

index 140533553c93e7e2149d3cd76ae9a1d1a93312e3..15de7600c16382246019e471b7b34d837f743552 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -1931,20 +1931,18 @@ static void llm_load_vocab(
          throw std::runtime_error("cannot find tokenizer vocab in model file\n");
      }
  
+    const float * scores = nullptr;
      const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
      }
  
-    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
+    const int * toktypes = nullptr;
      const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx == -1) {
-        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
      }
  
-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
      // determine vocab type
      {
          std::string tokenizer_name;
@@ -2012,8 +2010,8 @@ static void llm_load_vocab(
  
          auto & token_data = vocab.id_to_token[i];
          token_data.text  = std::move(word);
-        token_data.score = scores[i];
-        token_data.type  = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
      }
  
      // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
author	Cebtenzzre <redacted>
	Thu, 28 Sep 2023 18:30:15 +0000 (14:30 -0400)
committer	GitHub <redacted>
	Thu, 28 Sep 2023 18:30:15 +0000 (14:30 -0400)
convert-falcon-hf-to-gguf.py		patch | blob | history
convert-starcoder-hf-to-gguf.py		patch | blob | history
llama.cpp		patch | blob | history