* Superfluous parens in conditionals were removed.
* Unused arguments in function signatures were removed.
* Replaced unused `idx` var with `_`
* Initializing `file_format` and `format_version` attributes
* Renaming constant to capitals
* Preventing redefinition of the `f` var (all of the above are sketched below)
Signed-off-by: Jiri Podivin <redacted>
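A minimal sketch of the patterns listed above; the class, constant, and file here are hypothetical and only illustrate the style rules, they are not taken from the conversion scripts:

```python
# Hypothetical module-level constant, named in capitals.
VOCAB_SIZE = 32000

class ExampleModel:
    # Attributes declared on the class so they exist for type checkers
    # before __init__ assigns them.
    file_format: str
    format_version: int

    def __init__(self):  # no unused *args / **kwargs in the signature
        self.file_format = ''
        self.format_version = 0

    def load_tokens(self, path: str) -> list:
        tokens = []
        # A descriptive handle name avoids redefining `f` in nested scopes.
        with open(path, 'r', encoding='utf-8') as token_file:
            for _, line in enumerate(token_file):  # `_` replaces the unused index
                if len(tokens) >= VOCAB_SIZE:  # no superfluous parentheses
                    break
                tokens.append(line.strip())
        return tokens
```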
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
- if (rope_scaling is None):
+ if rope_scaling is None:
return
scale = max_pos_embds / orig_max_pos_embds
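For illustration, the scale factor is just the ratio of the extended context length to the original one; the numbers below are assumed, not read from any particular model config:

```python
# Hypothetical values: a model whose context was extended from 4k to 128k.
orig_max_pos_embds = 4096
max_pos_embds = 131072

scale = max_pos_embds / orig_max_pos_embds  # 131072 / 4096 = 32.0
```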
yield name, data
- def set_vocab(self, *args, **kwargs):
+ def set_vocab(self):
tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class']
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
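The same out-of-range bounds check appears in several of these loops. A self-contained sketch of the pattern, with a helper name and return shape chosen only for illustration:

```python
import json
import logging

logger = logging.getLogger(__name__)

def load_added_tokens(path: str, vocab_size: int) -> dict:
    """Return added tokens whose ids fit inside the base vocabulary."""
    with open(path, 'r', encoding='utf-8') as added_tokens_file:
        added_tokens_json = json.load(added_tokens_file)

    valid = {}
    for key, token_id in added_tokens_json.items():
        if token_id >= vocab_size:
            logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
            continue
        valid[key] = token_id
    return valid
```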
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
- with open(save_path, 'wb') as f:
- f.write(response.content)
+ with open(save_path, 'wb') as downloaded_file:
+ downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
- chktok = tokenizer.encode(chktxt)
+ chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
- chktxt = {repr(chktxt)}
+ chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
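Roughly how the hash gets consumed later: the checksum of the encoded check string is compared against known values to pick a pre-tokenizer name. The function name, mapping, and placeholder hash below are assumptions for illustration only:

```python
from hashlib import sha256

def get_pre_tokenizer_name(tokenizer, known_hashes: dict):
    # CHK_TXT is the module-level check string defined above.
    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    # known_hashes maps a checksum to a pre-tokenizer name, e.g.
    # {"<placeholder sha256>": "llama-bpe"}; an unknown hash returns None
    # so the caller can fail loudly and ask for the tokenizer to be registered.
    return known_hashes.get(chkhsh)
```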
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
- chktxt,
+ CHK_TXT,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
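A hedged sketch of that step, assuming `name`, `tests`, and `tokenizer` are in scope; the `__ggml_vocab_test__` separator and the matching `.out` file are assumptions about the on-disk format, shown only to make the step concrete:

```python
def write_vocab_tests(name: str, tests: list, tokenizer) -> None:
    # Each test string goes to the .inp file, followed by a separator line.
    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as inp_file:
        for text in tests:
            inp_file.write(f"{text}")
            inp_file.write("\n__ggml_vocab_test__\n")
    # The expected token ids go to a matching .out file, one line per test.
    with open(f"models/ggml-vocab-{name}.gguf.out", "w", encoding="utf-8") as out_file:
        for text in tests:
            for token in tokenizer.encode(text, add_special_tokens=False):
                out_file.write(f" {token}")
            out_file.write("\n")
```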
class GGMLModel:
+
+ file_format: GGMLFormat
+ format_version: int
+
def __init__(self):
self.hyperparameters = None
self.vocab = None
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
- for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+ for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
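Since the index is now discarded, a further simplification (not applied in this change) would be to drop `enumerate()` altogether:

```python
for vbytes, score, ttype in vo.all_tokens():
    tokens.append(vbytes)
    scores.append(score)
    toktypes.append(ttype)
```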