* gguf-py: gguf-dump: Respect --no-tensors flag in JSON mode.
* Respect add_bos_token GGUF metadata value
* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
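
For the add_bos_token change to alter behavior, the conversion side has to record the flags in the first place. Below is a minimal producer-side sketch, assuming gguf-py's `GGUFWriter` exposes `add_add_bos_token`/`add_add_eos_token`; the file name and values are placeholders.

```python
# Hedged sketch: record the add-BOS/EOS preference in GGUF metadata so the
# new llama.cpp code path has something to read. Assumes GGUFWriter provides
# add_add_bos_token()/add_add_eos_token(); adapt to your converter's flow.
from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama")
# ... tensors and the rest of the tokenizer metadata go here ...
writer.add_add_bos_token(True)    # tokenizer.ggml.add_bos_token
writer.add_add_eos_token(False)   # tokenizer.ggml.add_eos_token
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```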
return result;
}
+bool llama_should_add_bos_token(const llama_model * model) {
+ const int add_bos = llama_add_bos_token(model);
+
+ return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+}
+
//
// YAML utils
//
llama_context * ctx,
const std::vector<llama_token> & tokens);
+// Uses the value from the model metadata if present; otherwise defaults to
+// true when the model's vocab type is SPM and false otherwise.
+bool llama_should_add_bos_token(const llama_model * model);
+
//
// YAML utils
//
LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str());
}
- const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+ const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
- eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
+ eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
}
}
- const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+ const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
- const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
- const bool add_bos = is_spm;
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
- const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
- const bool add_bos = is_spm;
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
// This is needed as usual for LLaMA models
- const bool add_bos = is_spm;
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// Number of tasks to use when computing the score
if ( params.hellaswag_tasks < hs_task_count ) {
bool multimodal = false;
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
+ bool add_bos_token = true;
int32_t id_gen;
int32_t n_ctx; // total context for all clients / slots
n_ctx = llama_n_ctx(ctx);
+ add_bos_token = llama_should_add_bos_token(model);
+
return true;
}
}
void update_system_prompt() {
- system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+ system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
llama_batch_clear(batch);
}
else
{
- prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+ prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't a system prompt
}
slot.num_prompt_tokens = prompt_tokens.size();
const bool has_images = process_images(slot);
// process the prefix of first image
- std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
+ std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
{
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
- if not tokenizer_file.is_file():
- return False
- with open(tokenizer_file, encoding = 'utf-8') as f:
- tokenizer = json.load(f)
- if self.load_merges:
- merges = tokenizer.get('model', {}).get('merges')
- if isinstance(merges, list) and merges and isinstance(merges[0], str):
- self.merges = merges
+ if tokenizer_file.is_file():
+ with open(tokenizer_file, encoding = 'utf-8') as f:
+ tokenizer = json.load(f)
+ if self.load_merges:
+ merges = tokenizer.get('model', {}).get('merges')
+ if isinstance(merges, list) and merges and isinstance(merges[0], str):
+ self.merges = merges
+ added_tokens = tokenizer.get('added_tokens', {})
+ else:
+ added_tokens = {}
tokenizer_config_file = path / 'tokenizer_config.json'
- added_tokens = tokenizer.get('added_tokens')
- if added_tokens is None or not tokenizer_config_file.is_file():
+ if not tokenizer_config_file.is_file():
return True
with open(tokenizer_config_file, encoding = 'utf-8') as f:
tokenizer_config = json.load(f)
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):
self.add_special_token[typ] = add_entry
+ if not added_tokens:
+ # We need added_tokens to look up the content for this token, so if it's
+ # empty we may as well just give up.
+ continue
entry = tokenizer_config.get(f'{typ}_token')
if isinstance(entry, str):
tc_content = entry
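
The restructured loader above still reports `add_bos_token`/`add_eos_token` from `tokenizer_config.json` even when `tokenizer.json` is missing or has no usable `added_tokens`. A hedged usage sketch, assuming `SpecialVocab`'s existing constructor and its `add_special_token` attribute (the model path is a placeholder):

```python
# Hedged sketch: SpecialVocab now only skips the *content* lookup when
# added_tokens is empty; the add_bos_token/add_eos_token booleans are still
# collected from tokenizer_config.json and can be written to GGUF.
from pathlib import Path
from gguf import SpecialVocab

special_vocab = SpecialVocab(
    Path("path/to/hf-model"),             # directory with tokenizer_config.json
    load_merges=True,
    special_token_types=("bos", "eos", "unk", "sep", "pad"),
)
print(special_vocab.add_special_token)    # e.g. {'bos': True, 'eos': False}
# special_vocab.add_to_gguf(gguf_writer)  # emits *_token_id and add_*_token keys
```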
[tool.poetry]
name = "gguf"
-version = "0.5.2"
+version = "0.5.3"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
else:
curr["value"] = field.parts[-1].tolist()[0]
- for idx, tensor in enumerate(reader.tensors):
- tensors[tensor.name] = {
- "index": idx,
- "shape": tensor.shape.tolist(),
- "type": tensor.tensor_type.name,
- "offset": tensor.field.offset,
- }
+ if not args.no_tensors:
+ for idx, tensor in enumerate(reader.tensors):
+ tensors[tensor.name] = {
+ "index": idx,
+ "shape": tensor.shape.tolist(),
+ "type": tensor.tensor_type.name,
+ "offset": tensor.field.offset,
+ }
json.dump(result, sys.stdout)
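
On the consumer side, the same metadata that the dump prints can be checked programmatically; a small sketch mirroring how the code above decodes scalar fields via `field.parts[-1]` (the file path comes from the command line):

```python
# Hedged sketch: read tokenizer.ggml.add_bos_token back out of a GGUF file,
# decoding the field the same way the JSON dump above does.
import sys
from gguf import GGUFReader

reader = GGUFReader(sys.argv[1])
field = reader.fields.get("tokenizer.ggml.add_bos_token")
if field is None:
    # A missing key corresponds to llama_add_bos_token() returning -1 (unknown).
    print("add_bos_token: not set; llama.cpp falls back to the SPM default")
else:
    print("add_bos_token:", bool(field.parts[-1].tolist()[0]))
```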
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
};
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
};
id special_sep_id = -1;
id special_pad_id = -1;
+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
__func__, key.c_str(), id, old_id);
id = old_id;
}
+
+ }
+
+ // Handle add_bos_token and add_eos_token
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+ int kid = gguf_find_key(ctx, key.c_str());
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+ }
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+ kid = gguf_find_key(ctx, key.c_str());
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
}
}
return model->vocab.linefeed_id;
}
+int llama_add_bos_token(const struct llama_model * model) {
+ return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+ return model->vocab.special_add_eos;
+}
+
llama_token llama_token_prefix(const struct llama_model * model) {
return model->vocab.special_prefix_id;
}
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
// codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle