struct common_chat_templates {
bool add_bos;
bool add_eos;
- bool has_explicit_template; // Model had builtin template or template overridde was specified.
+ bool has_explicit_template; // Model had builtin template or a template override was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
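// illustration (not part of the patch): a minimal sketch of how the flag above could be
// derived at init time; the helper name and parameters are hypothetical, not the real API
static bool resolve_explicit_template(const std::string & builtin_tmpl, const std::string & override_tmpl) {
    // explicit if the model shipped a built-in template or the user passed an override;
    // otherwise template_default falls back to chatml, per the comment above
    return !builtin_tmpl.empty() || !override_tmpl.empty();
}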
// statistics of an n-gram
struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
- size_t stat_idx; // index of last token of stastistics computation (key_num, values)
+ size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
};
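// illustration (not part of the patch): roughly how key_idx relates to the token history;
// COMMON_NGRAM_SIZE, the tokens buffer, and this helper are hypothetical stand-ins
static bool common_ngram_key_matches(const llama_token * tokens, const common_ngram_map_key & key, size_t pos) {
    // compare the n-gram starting at pos against the key n-gram stored at key.key_idx
    for (size_t i = 0; i < COMMON_NGRAM_SIZE; ++i) {
        if (tokens[pos + i] != tokens[key.key_idx + i]) {
            return false;
        }
    }
    // on a match, key_num would be incremented and the token that follows the
    // n-gram would be recorded in values[]
    return true;
}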
// get extra buffer types of the CPU
- // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+ // TODO: a more general solution for non-CPU extra buft should be implemented in the future
// ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
std::vector<ggml_backend_buffer_type_t> buft_extra;
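// illustration (not part of the patch): one way the list could be populated, assuming the
// CPU backend exposes its extra buffer types through the ggml_backend_dev_get_extra_bufts
// proc address; cpu_reg and cpu_dev are hypothetical handles -- see the linked PR for details
auto * get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
    ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (get_extra_bufts_fn) {
    // the callback returns a NULL-terminated array of extra buffer types
    for (ggml_backend_buffer_type_t * extra = get_extra_bufts_fn(cpu_dev); extra && *extra; ++extra) {
        buft_extra.push_back(*extra);
    }
}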
// typical for M-RoPE cases:
- // 0 - sequantial position of the tokens/embeddings in the sequence
+ // 0 - sequential position of the tokens/embeddings in the sequence
// 1 - y position in the image
// 2 - x position in the image
// 3 - other
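// illustration (not part of the patch): for an image of n_x * n_y patches starting after
// n_past tokens, the four streams above could be filled as below; pos is a hypothetical
// [4 * n_tokens] buffer with one stream after another, and the constant stream 0 follows
// the Qwen2-VL-style convention of one temporal position per image
for (int y = 0; y < n_y; ++y) {
    for (int x = 0; x < n_x; ++x) {
        const int i = y*n_x + x;
        pos[0*n_tokens + i] = n_past; // sequential position, shared by the image's patches
        pos[1*n_tokens + i] = y;      // y position in the image
        pos[2*n_tokens + i] = x;      // x position in the image
        pos[3*n_tokens + i] = 0;      // other (unused here)
    }
}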
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
{
- // TODO: not sure if the following graph would be worster case for multi-stream KV caches:
+ // TODO: not sure if the following graph would be the worst case for multi-stream KV caches:
//
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
//
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
- // but this would make the graph topology depend on the number of output tokens, which can interere with
+ // but this would make the graph topology depend on the number of output tokens, which can interfere with
// features that require constant topology such as pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
//if (n_outputs < n_tokens) {
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location
- // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+ // note: the heads in k_cur and v_cur should be laid out contiguously in memory
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
// - k_idxs [n_tokens]
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
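// illustration (not part of the patch): allocating k_cur with the shape given above; a
// freshly created ggml tensor is row-major with ne0 fastest, so the n_head_k heads of
// each token form one contiguous slab, satisfying the layout note
ggml_tensor * k_cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head_k, n_head_k, n_tokens);
GGML_ASSERT(ggml_is_contiguous(k_cur));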
inpL = build_inp_embd(model.tok_embd);
- // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
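// why the conditional scale: multiplying token embeddings by sqrt(n_embd) is the
// Transformer/Gemma convention for matching embedding variance to the residual stream;
// ubatch.token is null for raw embeddings input, so encoded image embeddings pass
// through unscaled (factor 1.0f)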
inpL = build_inp_embd(model.tok_embd);
- // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
inpL = build_inp_embd(model.tok_embd);
- // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);