// for now, we apply this workaround to find the tokens based on their text
for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eot_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<|eom_id|>"
) {
special_eom_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<|code_prefix|>" // GLM-4.5
) {
special_fim_pre_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<|code_suffix|>" // GLM-4.5
) {
special_fim_suf_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<|code_middle|>" // GLM-4.5
) {
special_fim_mid_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<reponame>" // Granite
) {
special_fim_rep_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
|| t.first == "<|file_sep|>" // Qwen
) {
special_fim_sep_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
}
+ // auto-detect unused tokens: e.g. control tokens with the word "unused"
+ // ideally, these tokens should be marked as unused during conversion
+ { // scoped so that n_unused does not leak into the surrounding code
+ uint32_t n_unused = 0; // control tokens carrying UNUSED after this pass (pre-flagged or flagged here)
+
+ for (const auto & t : token_to_id) { // t.first: token text, t.second: token id
+ auto & attr = id_to_token[t.second].attr; // mutable reference: the update below is written back into id_to_token
+
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ continue; // only control tokens are candidates; everything else is neither flagged nor counted
+ }
+
+ if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
+ if (strstr(t.first.c_str(), "unused") != NULL) { // plain case-sensitive substring match on the token text
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED); // adds the UNUSED bit and keeps all bits already set
+ }
+ }
+
+ if (attr & LLAMA_TOKEN_ATTR_UNUSED) { // true both for tokens flagged just above and for tokens that already had the bit
+ n_unused++;
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
+ }
+
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
}
// Walk every (text, id) pair of the vocabulary and collect end-of-generation
// tokens by their text into special_eog_ids.
// NOTE(review): the list of matched strings below looks truncated in this view
// (patch context elided) - confirm against the full file before relying on it.
for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eog_ids.insert(t.second);
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL); // new form ORs CONTROL in and keeps the other bits; the removed line replaced the whole attribute set
}
} else {
- // token is control, but not marked as EOG -> print a debug log
- if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
- LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
- __func__, t.second, t.first.c_str());
+ if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) { // '&' binds tighter than '&&': control tokens that are not flagged UNUSED
+ // token is control, but not marked as EOG -> print a debug log
+ if (special_eog_ids.count(t.second) == 0) { // the id may already be in the set even though its text did not match above
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
}
}
}
// @ngxson : quick hack for gpt-oss, always render these tokens
// The attribute set of these tokens is replaced outright with USER_DEFINED,
// exactly as the removed line did. OR-ing USER_DEFINED into the existing value
// would leave bits such as CONTROL or UNUSED in place.
// NOTE(review): assumes the detokenizer hides tokens that still carry CONTROL
// unless special-token output is requested - confirm against the rendering path.
for (const auto & t : token_to_id) { // t.first: token text, t.second: token id
+ auto & attr = id_to_token[t.second].attr;
+
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED; // overwrite, do not OR: drops every previously set bit
}
}
LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
for (auto tid : special_eog_ids) {
- LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+ auto & text = id_to_token[tid].text;
+
+ LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
- if (id_to_token[tid].text == "<|return|>") {
+ if (text == "<|return|>") {
has_return = true;
- } else if (id_to_token[tid].text == "<|call|>" || id_to_token[tid].text == "<|calls|>") {
+ } else if (text == "<|call|>" || text == "<|calls|>") {
has_call = true;
- } else if (id_to_token[tid].text == "<|flush|>") {
+ } else if (text == "<|flush|>") {
has_flush = true;
- } else if (id_to_token[tid].text == "<|end|>") {
+ } else if (text == "<|end|>") {
has_end = true;
end_id = tid;
}
// When the EOG set holds '<|end|>' together with ('<|return|>' and '<|call|>')
// or with ('<|call|>' and '<|flush|>'), '<|end|>' is dropped from the EOG set
// and its attribute set is replaced with USER_DEFINED.
// The replacement is a full overwrite, matching the removed line: OR-ing the
// bit in would keep whatever was set before (e.g. CONTROL).
// NOTE(review): assumes tokens that keep CONTROL are not rendered by default -
// confirm against the detokenizer.
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
special_eog_ids.erase(end_id);
- id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+
+ auto & attr = id_to_token[end_id].attr;
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED; // overwrite, do not OR: drops every previously set bit
+
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}