LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
+ LLM_ARCH_NOMIC_BERT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
};
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
};
enum llm_kv {
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_POOLING_TYPE,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_UP_EXP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
};
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_POS_EMBD, "position_embd" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_NOMIC_BERT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
llm_arch arch;
std::string operator()(llm_tensor tensor) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
return LLM_TENSOR_NAMES[arch].at(tensor);
}
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
}
std::string operator()(llm_tensor tensor, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
}
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
}
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
}
};
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
- if (numa) { prefetch = 0; }
+ if (numa) { prefetch = 0; }
#ifdef __linux__
// advise the kernel to read the file sequentially (increases readahead)
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
MODEL_22M,
MODEL_33M,
MODEL_109M,
+ MODEL_137M,
MODEL_335M,
MODEL_0_5B,
MODEL_1B,
uint32_t n_yarn_orig_ctx;
int32_t rope_scaling_type_train;
- float f_clamp_kqv;
- float f_max_alibi_bias;
+ float f_clamp_kqv = 0.0f;
+ float f_max_alibi_bias = 0.0f;
bool causal_attn = true;
+ bool need_kq_pos = false;
+ uint32_t pooling_type = LLAMA_POOLING_NONE;
bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
bool mul_mat_q;
bool offload_kqv;
+ bool do_pooling;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b;
+ struct ggml_tensor * attn_out_norm;
+ struct ggml_tensor * attn_out_norm_b;
// attention
struct ggml_tensor * wq;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * layer_out_norm;
+ struct ggml_tensor * layer_out_norm_b;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
- struct ggml_tensor * inp_sum; // F32 [1, n_batch]
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
+ struct ggml_tensor * inp_cls; // I32 [n_batch]
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
default: return "unknown, may not work";
}
static const char * llama_model_type_name(e_model type) {
switch (type) {
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_0_5B: return "0.5B";
case MODEL_1B: return "1B";
case MODEL_2B: return "2B";
case MODEL_3B: return "3B";
case 40: model.type = e_model::MODEL_13B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
+
+ if (model.type == e_model::MODEL_13B) {
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
+ }
} break;
case LLM_ARCH_STARCODER:
{
case 32: model.type = e_model::MODEL_1B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
+
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
} break;
case LLM_ARCH_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
switch (hparams.n_layer) {
case 3:
model.type = e_model::MODEL_335M; break; // bge-large
}
} break;
+ case LLM_ARCH_NOMIC_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+ model.type = e_model::MODEL_137M;
+ }
+ } break;
case LLM_ARCH_BLOOM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
case 4096: model.type = e_model::MODEL_7B; break;
} break;
}
+
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
} break;
case LLM_ARCH_MPT:
{
- hparams.f_clamp_kqv = 0.0f;
-
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
}
model.ftype = ml.ftype;
+
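+ // models that use ALiBi need the absolute positions of the KV cells (inp_KQ_pos) to build the bias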
+ if (hparams.f_max_alibi_bias > 0.0f) {
+ hparams.need_kq_pos = true;
+ }
}
// TODO: This should probably be in llama.h
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ try {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } catch (const std::exception & e) {
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, e.what());
+ vocab.linefeed_id = vocab.special_pad_id;
+ }
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.linefeed_id = vocab.special_pad_id;
} else {
}
} break;
case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
{
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
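+ // learned absolute position embeddings are BERT-only; nomic-bert applies RoPE in the attention instead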
+ if (model.arch == LLM_ARCH_BERT) {
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ }
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+ if (model.arch == LLM_ARCH_BERT) {
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ } else {
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ }
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ if (model.arch == LLM_ARCH_BERT) {
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ } else {
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ }
+
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
}
} break;
case LLM_ARCH_BLOOM:
model.hparams.vocab_only = params.vocab_only;
- llm_load_arch (ml, model);
- llm_load_hparams(ml, model);
- llm_load_vocab (ml, model);
+ try {
+ llm_load_arch(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+ }
+ try {
+ llm_load_hparams(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+ }
+ try {
+ llm_load_vocab(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+ }
llm_load_print_meta(ml, model);
struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
+ struct ggml_tensor * kq_pos,
int64_t n_ctx,
int32_t n_tokens,
int32_t n_kv,
- float max_alibi_bias,
float kq_scale,
const llm_build_cb & cb,
int il) {
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
}
- if (max_alibi_bias > 0.0f) {
- // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
+ if (hparams.f_max_alibi_bias > 0.0f) {
kq = ggml_scale(ctx, kq, kq_scale);
cb(kq, "kq_scaled", il);
- if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);
kq = ggml_add(ctx, kq, kq_mask);
cb(kq, "kq_masked", il);
kq = ggml_soft_max(ctx, kq);
cb(kq, "kq_soft_max", il);
- } else {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+ } else
+#endif
+ {
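+ // fused path: ggml_soft_max_ext applies the scale, the mask and, when f_max_alibi_bias > 0, the ALiBi bias computed from kq_pos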
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
}
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
+ struct ggml_tensor * kq_pos,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
- float max_alibi_bias,
float kq_scale,
const llm_build_cb & cb,
int il) {
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
- cur = llm_build_kqv(ctx, model, hparams, kv, graph,
- wo, wo_b,
- q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
+ q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
const int32_t n_orig_ctx;
const bool do_rope_shift;
- const bool causal_attn;
+ const uint32_t pooling_type;
const llm_build_cb & cb;
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
n_orig_ctx (cparams.n_yarn_orig_ctx),
do_rope_shift (worst_case || kv_self.has_shift),
- causal_attn (hparams.causal_attn),
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
// all initializations should be done in init()
}
Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
cb(Kcur, "Kcur", il);
- // apply ALiBi for 13B model
- const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
// get input vectors with right size
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
- struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
// construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
// token types are hardcoded to zero ("Sentence A")
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ if (model.arch == LLM_ARCH_BERT) {
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ }
cb(inpL, "inp_embd", -1);
// embed layer norm
struct ggml_tensor * cur = inpL;
// self-attention
- {
+ if (model.arch == LLM_ARCH_BERT) {
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ } else {
+ // compute Q and K and RoPE them
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = ggml_add(ctx0, cur, inpL);
// attention layer norm
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
struct ggml_tensor * ffn_inp = cur;
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
- cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ if (model.arch == LLM_ARCH_BERT) {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ } else {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ }
cb(cur, "ffn_out", il);
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
// output layer norm
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
// input for next layer
inpL = cur;
// final output
cur = inpL;
- // pooling
- cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
- cb(cur, "result_embed", -1);
+ // pooling layer
+ if (pooling_type == LLAMA_POOLING_MEAN) {
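+ // average the token embeddings of each sequence using the weights prepared in inp_mean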
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+ } else if (pooling_type == LLAMA_POOLING_CLS) {
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
+ } else {
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
+ }
+ cb(cur, "result_embd", -1);
ggml_build_forward_expand(gf, cur);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
inpL = llm_build_norm(ctx0, inpL, hparams,
model.tok_norm,
model.tok_norm_b,
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * attn_norm;
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
result = llm.build_refact();
} break;
case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
{
result = llm.build_bert();
} break;
for (int i = 0; i < n_kv; ++i) {
float f;
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
+ (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
f = -INFINITY;
} else {
f = 0;
}
}
+ if (hparams.need_kq_pos) {
+ const int64_t n_kv = kv_self.n;
- {
- assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
- float * data = (float *) lctx.inp_sum->data;
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
+
+ float * data = (float *) lctx.inp_KQ_pos->data;
- for (int i = 0; i < batch.n_tokens; ++i) {
- data[i] = 1.0f/float(batch.n_tokens);
+ for (int i = 0; i < n_kv; ++i) {
+ data[i] = float(lctx.kv_self.cells[i].pos);
}
}
data[i] = lctx.kv_self.cells[i].delta;
}
}
+
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
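+ // fill inp_mean with per-sequence averaging weights: data[seq_id*n_tokens + i] = 1/(tokens in seq_id) for every token i of that sequence, 0 elsewhere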
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+ float * data = (float *) lctx.inp_mean->data;
+
+ memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+
+ std::vector<uint64_t> sum(n_tokens, 0);
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ sum[seq_id] += 1;
+ }
+
+ std::vector<float> div(n_tokens, 0.0f);
+ for (int i = 0; i < n_tokens; ++i) {
+ const uint64_t s = sum[i];
+ if (s > 0) {
+ div[i] = 1.0f/float(s);
+ }
+ }
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ data[seq_id*n_tokens + i] = div[seq_id];
+ }
+ }
+
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
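+ // record the index of each sequence's first token (pos == 0) so CLS pooling can gather that row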
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ const llama_pos pos = batch.pos[i];
+ if (pos == 0) {
+ data[seq_id] = i;
+ }
+ }
+ }
}
// decode a batch of tokens by evaluating the transformer
embeddings = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}
- } else if (strcmp(res->name, "result_embed") == 0) {
+ } else if (strcmp(res->name, "result_embd") == 0) {
embeddings = res;
res = nullptr;
} else {
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
- const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
- embedding_out.resize(n_embd);
+ embedding_out.resize(embd_size);
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
ggml_backend_synchronize(embeddings_backend);
}
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
- return vocab.token_to_id.at(buf);
+ auto token = vocab.token_to_id.find(buf);
+ if (token != vocab.token_to_id.end()) {
+ return (*token).second;
+ }
+ // Try to fall back to just the byte as a string
+ const char buf2[2] = { (char)ch, 0 };
+ return vocab.token_to_id.at(buf2);
}
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: {
};
struct llm_tokenizer_spm {
- llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+ llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars
if (p == rev_merge.end()) {
// output any symbols that did not form tokens as bytes.
+ output.reserve(output.size() + symbol.n);
for (int j = 0; j < (int)symbol.n; ++j) {
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
output.push_back(token_id);
token(_token),
raw_text(_dummy),
offset(0),
- length(0){}
+ length(0) {}
+
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
- token((llama_vocab::id)-1),
+ token((llama_vocab::id) - 1),
raw_text(_raw_text),
offset(_offset),
length(_length){
- GGML_ASSERT( _offset >= 0 );
- GGML_ASSERT( _length >= 1 );
- GGML_ASSERT( offset + length <= raw_text.length() );
+ GGML_ASSERT(_offset >= 0);
+ GGML_ASSERT(_length >= 1);
+ GGML_ASSERT(offset + length <= raw_text.length());
}
const FRAGMENT_BUFFER_VARIANT_TYPE type;
}
std::forward_list<fragment_buffer_variant> fragment_buffer;
- fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
- if (special) tokenizer_st_partition( vocab, fragment_buffer );
+ if (special) tokenizer_st_partition(vocab, fragment_buffer);
switch (vocab.type) {
case LLAMA_VOCAB_TYPE_SPM:
{
- for (const auto & fragment: fragment_buffer) {
+ for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
- for (const auto & fragment: fragment_buffer) {
+ for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
} break;
case LLAMA_VOCAB_TYPE_WPM:
{
- for (const auto & fragment: fragment_buffer) {
+ for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;
}
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
} else if (name == "token_embd.weight") {
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
new_type = GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_Q4_K;
}
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
if (name.find("attn_v.weight") != std::string::npos) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = GGML_TYPE_Q2_K;
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
++qs.i_ffn_down;
}
+ else if (name.find("attn_output.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+ }
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
++qs.i_ffn_up;
}
+
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
- new_type == GGML_TYPE_IQ3_XXS) {
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
quantize &= !params->only_copy;
// do not quantize expert gating tensors
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
enum ggml_type new_type;
void * new_data;
}
if ((new_type == GGML_TYPE_IQ2_XXS ||
new_type == GGML_TYPE_IQ2_XS ||
+ new_type == GGML_TYPE_IQ1_S ||
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
{
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
__func__, ftype);
- return false;
+ return 1;
}
}
/*.logits_all =*/ false,
/*.embedding =*/ false,
/*.offload_kqv =*/ true,
+ /*.do_pooling =*/ true,
};
return result;
return llama_supports_mlock();
}
-void llama_backend_init(bool numa) {
+void llama_backend_init(void) {
ggml_time_init();
// needed to initialize f16 tables
ggml_free(ctx);
}
- if (numa) {
- ggml_numa_init();
- }
-
#ifdef GGML_USE_MPI
ggml_mpi_backend_init();
#endif
}
+void llama_numa_init(enum ggml_numa_strategy numa) {
+ if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+ ggml_numa_init(numa);
+ }
+}
+
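// Caller-side sketch of the new split initialization, assuming the matching llama.h
// declarations (llama_backend_init, llama_numa_init, llama_backend_free) and the
// ggml_numa_strategy enum from ggml.h; the helper name is illustrative only.
static void backend_init_sketch(void) {
    llama_backend_init();                         // no longer takes a bool numa flag
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED); // optional; pass a real strategy to enable NUMA handling
    // ... load models, create contexts, decode ...
    llama_backend_free();
}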
void llama_backend_free(void) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_free();
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.mul_mat_q = params.mul_mat_q;
cparams.offload_kqv = params.offload_kqv;
+ cparams.do_pooling = params.do_pooling;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
// resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
- if (params.embedding){
+ if (params.embedding) {
ctx->embedding.resize(hparams.n_embd);
}
// graph inputs
{
ggml_init_params init_params = {
- /* .mem_size */ ggml_tensor_overhead()*7,
+ /* .mem_size */ ggml_tensor_overhead()*8,
/* .mem_buffer */ nullptr,
/* .no_alloc */ true,
};
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+ ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
- ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
+ ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+ ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ggml_set_name(ctx->inp_tokens, "inp_tokens");
ggml_set_name(ctx->inp_embd, "inp_embd");
ggml_set_name(ctx->inp_pos, "inp_pos");
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+ ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
- ggml_set_name(ctx->inp_sum, "inp_sum");
+ ggml_set_name(ctx->inp_mean, "inp_mean");
+ ggml_set_name(ctx->inp_cls, "inp_cls");
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
return ctx->embedding.data();
}
+float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+ return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+}
+
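// Usage sketch for the new accessor, assuming the llama.h declarations of llama_n_embd()
// and llama_get_embeddings_ith() plus <vector>; the helper name is illustrative only.
// With do_pooling enabled, each sequence in the batch gets one pooled embedding row of size n_embd.
static std::vector<float> pooled_embd_sketch(llama_context * ctx, const llama_model * model, int32_t seq) {
    const int32_t n_embd = llama_n_embd(model);
    const float * row = llama_get_embeddings_ith(ctx, seq); // row `seq` of the embedding buffer
    return std::vector<float>(row, row + n_embd);
}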
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
return model->vocab.id_to_token[token].text.c_str();
}
return 0;
}
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
+ start += 1;
+ }
+ while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
+ end -= 1;
+ }
+ return str.substr(start, end - start);
+}
+
+ // Simple version of "llama_chat_apply_template" that only works with strings
+ // This function uses heuristic checks to detect commonly used templates. It is not a Jinja parser.
+static int32_t llama_chat_apply_template_internal(
+ const std::string & tmpl,
+ const std::vector<const llama_chat_message *> & chat,
+ std::string & dest, bool add_ass) {
+ // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+ std::stringstream ss;
+ if (tmpl.find("<|im_start|>") != std::string::npos) {
+ // chatml template
+ for (auto message : chat) {
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|im_start|>assistant\n";
+ }
+ } else if (tmpl.find("[INST]") != std::string::npos) {
+ // llama2 template and its variants
+ // [variant] support system message
+ bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+ // [variant] space before + after response
+ bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+ // [variant] add BOS inside history
+ bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+ // [variant] trim spaces from the input message
+ bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+ // construct the prompt
+ bool is_inside_turn = true; // skip BOS at the beginning
+ ss << "[INST] ";
+ for (auto message : chat) {
+ std::string content = strip_message ? trim(message->content) : message->content;
+ std::string role(message->role);
+ if (!is_inside_turn) {
+ is_inside_turn = true;
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+ }
+ if (role == "system") {
+ if (support_system_message) {
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+ } else {
+ // if the model does not support a system message, we still include it in the first message, but without <<SYS>>
+ ss << content << "\n";
+ }
+ } else if (role == "user") {
+ ss << content << " [/INST]";
+ } else {
+ ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+ is_inside_turn = false;
+ }
+ }
+ // llama2 templates seem to not care about "add_generation_prompt"
+ } else if (tmpl.find("<|user|>") != std::string::npos) {
+ // zephyr template
+ for (auto message : chat) {
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else {
+ // template not supported
+ return -1;
+ }
+ dest = ss.str();
+ return dest.size();
+}
+
+LLAMA_API int32_t llama_chat_apply_template(
+ const struct llama_model * model,
+ const char * tmpl,
+ const struct llama_chat_message * chat,
+ size_t n_msg,
+ bool add_ass,
+ char * buf,
+ int32_t length) {
+ std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+ if (tmpl == nullptr) {
+ GGML_ASSERT(model != nullptr);
+ // load template from model
+ std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+ std::string template_key = "tokenizer.chat_template";
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+ if (res < 0) {
+ // worst case: there is no information about the template, so fall back to chatml by default
+ curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+ } else {
+ curr_tmpl = std::string(model_template.data(), model_template.size());
+ }
+ }
+ // format the chat to string
+ std::vector<const llama_chat_message *> chat_vec;
+ chat_vec.resize(n_msg);
+ for (size_t i = 0; i < n_msg; i++) {
+ chat_vec[i] = &chat[i];
+ }
+ std::string formatted_chat;
+ int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+ if (res < 0) {
+ return res;
+ }
+ strncpy(buf, formatted_chat.c_str(), length);
+ return res;
+}
+
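// Caller-side sketch, assuming the llama_chat_message struct and this function as declared
// in llama.h; the helper name is illustrative only. It formats a short chat with the model's
// built-in template (tmpl == nullptr) and retries with a larger buffer if the result was truncated.
static std::string format_chat_sketch(const llama_model * model) {
    const llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(model, nullptr, msgs, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (res < 0) {
        return std::string(); // template not recognized
    }
    if (res > (int32_t) buf.size()) {
        buf.resize(res);      // the returned length is the full formatted size
        res = llama_chat_apply_template(model, nullptr, msgs, 2, true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), res);
}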
struct llama_timings llama_get_timings(struct llama_context * ctx) {
struct llama_timings result = {
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,