#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
+// bump if necessary
#define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS 160
+#define LLAMA_MAX_LAYERS 256
+#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
//
// logging
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
LLM_ARCH_COMMAND_R,
LLM_ARCH_DBRX,
LLM_ARCH_OLMO,
+ LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC,
LLM_ARCH_DEEPSEEK2,
+ LLM_ARCH_CHATGLM,
LLM_ARCH_BITNET,
+ LLM_ARCH_T5,
+ LLM_ARCH_JAIS,
LLM_ARCH_UNKNOWN,
};
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
+ { LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" },
+ { LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
+ LLM_KV_DECODER_START_TOKEN_ID,
+ LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+ LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_CAUSAL,
LLM_KV_ATTENTION_Q_LORA_RANK,
LLM_KV_ATTENTION_KV_LORA_RANK,
+ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
+ LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_TOKENIZER_ADD_BOS,
LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_ADD_PREFIX,
+ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
+ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_PREFIX_ID,
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
-
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
+ { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
+ { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
- { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
- { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
- { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
- { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
- { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
- { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
- { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
- { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
- { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
- { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
- { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
- { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
- { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
- { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
- { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
- { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
- { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
- { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
- { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
- { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
- { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
- { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
- { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
+ { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
+ { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
+ { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
+ { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
+ { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
+ { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
+    { LLM_KV_TOKENIZER_SEP_ID,                  "tokenizer.ggml.seperator_token_id"     }, // note: "seperator" is the historical key spelling, kept for compatibility
+ { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
+ { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
+ { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
+ { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
+ { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
};
struct LLM_KV {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
LLM_TENSOR_FFN_SUB_NORM,
+ LLM_TENSOR_DEC_ATTN_NORM,
+ LLM_TENSOR_DEC_ATTN_Q,
+ LLM_TENSOR_DEC_ATTN_K,
+ LLM_TENSOR_DEC_ATTN_V,
+ LLM_TENSOR_DEC_ATTN_OUT,
+ LLM_TENSOR_DEC_ATTN_REL_B,
+ LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+ LLM_TENSOR_DEC_CROSS_ATTN_Q,
+ LLM_TENSOR_DEC_CROSS_ATTN_K,
+ LLM_TENSOR_DEC_CROSS_ATTN_V,
+ LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+ LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+ LLM_TENSOR_DEC_FFN_NORM,
+ LLM_TENSOR_DEC_FFN_GATE,
+ LLM_TENSOR_DEC_FFN_DOWN,
+ LLM_TENSOR_DEC_FFN_UP,
+ LLM_TENSOR_DEC_OUTPUT_NORM,
+ LLM_TENSOR_ENC_ATTN_NORM,
+ LLM_TENSOR_ENC_ATTN_Q,
+ LLM_TENSOR_ENC_ATTN_K,
+ LLM_TENSOR_ENC_ATTN_V,
+ LLM_TENSOR_ENC_ATTN_OUT,
+ LLM_TENSOR_ENC_ATTN_REL_B,
+ LLM_TENSOR_ENC_FFN_NORM,
+ LLM_TENSOR_ENC_FFN_GATE,
+ LLM_TENSOR_ENC_FFN_DOWN,
+ LLM_TENSOR_ENC_FFN_UP,
+ LLM_TENSOR_ENC_OUTPUT_NORM,
};
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_OPENELM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_ARCTIC,
{
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
+ LLM_ARCH_CHATGLM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
{
LLM_ARCH_BITNET,
{
{ LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
},
},
+ {
+ LLM_ARCH_T5,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" },
+ { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" },
+ { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" },
+ { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" },
+ { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" },
+ { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" },
+ { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
+ { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" },
+ { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" },
+ { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" },
+ { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+ { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+ { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+ { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+ { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+ { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+ { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+ { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+ { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_JAIS,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
{
// NOTE: avoid ever using this except for building the token_to_piece caches
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
- std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
- if (n_tokens < 0) {
- result.resize(-n_tokens);
- int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
- GGML_ASSERT(check == -n_tokens);
+ std::string piece;
+    piece.resize(piece.capacity()); // use the string's preallocated capacity as the initial buffer
+ const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
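+    // a negative return value is the negated size of the buffer required to hold the piece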
+ if (n_chars < 0) {
+ piece.resize(-n_chars);
+ int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
+ GGML_ASSERT(check == -n_chars);
}
else {
- result.resize(n_tokens);
+ piece.resize(n_chars);
}
- return std::string(result.data(), result.size());
+ return piece;
}
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
MODEL_17M,
MODEL_22M,
MODEL_33M,
+ MODEL_60M,
MODEL_70M,
+ MODEL_80M,
MODEL_109M,
MODEL_137M,
MODEL_160M,
+ MODEL_220M,
+ MODEL_250M,
+ MODEL_270M,
MODEL_335M,
MODEL_410M,
+ MODEL_450M,
+ MODEL_770M,
+ MODEL_780M,
MODEL_0_5B,
MODEL_1B,
+ MODEL_1_3B,
MODEL_1_4B,
MODEL_2B,
MODEL_2_8B,
MODEL_3B,
MODEL_4B,
+ MODEL_6B,
MODEL_6_9B,
MODEL_7B,
MODEL_8B,
+ MODEL_9B,
+ MODEL_11B,
MODEL_12B,
MODEL_13B,
MODEL_14B,
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_57B_A14B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
- uint32_t n_head;
- uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
+    uint32_t n_swa = 0; // size of the sliding attention window (SWA); 0 = not used
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
- uint32_t n_ff;
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_vocab_type = 0; // for BERT-style token types
+ uint32_t n_rel_attn_bkts = 0;
+
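+    // per-layer sizes: some architectures (e.g. OpenELM) vary the number of heads
+    // and the FFN size from layer to layer, so these values are stored per layer
+    // and read through the n_head()/n_head_kv()/n_ff() accessors below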
+ std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
float f_norm_eps;
float f_norm_rms_eps;
+ float f_attn_logit_softcapping = 50.0f;
+ float f_final_logit_softcapping = 30.0f;
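+    // note: softcapping squashes a value into the range (-cap, cap) via
+    //   softcap(x) = cap * tanh(x / cap)
+    // the defaults above match the values shipped with Gemma 2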
+
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;
- bool causal_attn = true;
- bool use_alibi = false;
+ bool causal_attn = true;
+ bool use_alibi = false;
+ bool attn_soft_cap = false;
+
+ // needed by encoder-decoder models (e.g. T5, FLAN-T5)
+ // ref: https://github.com/ggerganov/llama.cpp/pull/8141
+ llama_token dec_start_token_id = -1;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
if (this->n_vocab != other.n_vocab) return true;
if (this->n_ctx_train != other.n_ctx_train) return true;
if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
+ if (this->n_swa != other.n_swa) return true;
if (this->n_embd_head_k != other.n_embd_head_k) return true;
if (this->n_embd_head_v != other.n_embd_head_v) return true;
- if (this->n_ff != other.n_ff) return true;
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
+ if (this->n_head_arr != other.n_head_arr) return true;
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
+ if (this->n_ff_arr != other.n_ff_arr) return true;
+
+ if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
if (this->n_lora_q != other.n_lora_q) return true;
if (this->n_lora_kv != other.n_lora_kv) return true;
if (this->ssm_d_state != other.ssm_d_state) return true;
if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+ if (this->dec_start_token_id != other.dec_start_token_id) return true;
+
const float EPSILON = 1e-9f;
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
return false;
}
- uint32_t n_gqa() const {
+ uint32_t n_head(uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_head_arr[il];
+ }
+
+ GGML_ASSERT(false);
+ return 0;
+ }
+
+ uint32_t n_head_kv(uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_head_kv_arr[il];
+ }
+
+ GGML_ASSERT(false);
+ return 0;
+ }
+
+ uint32_t n_ff(uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_ff_arr[il];
+ }
+
+ GGML_ASSERT(false);
+ return 0;
+ }
+
+ uint32_t n_gqa(uint32_t il = 0) const {
+ const uint32_t n_head = this->n_head(il);
+ const uint32_t n_head_kv = this->n_head_kv(il);
+
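+        // example: n_head = 32 with n_head_kv = 8 gives n_gqa() == 4,
+        // i.e. each k-v head is shared by 4 query heads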
if (n_head_kv == 0) {
return 0;
}
+
return n_head/n_head_kv;
}
- uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+ uint32_t n_embd_k_gqa(uint32_t il = 0) const { // dimension of key embeddings across all k-v heads
+ const uint32_t n_head_kv = this->n_head_kv(il);
+
return n_embd_head_k * n_head_kv;
}
- uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+ uint32_t n_embd_v_gqa(uint32_t il = 0) const { // dimension of value embeddings across all k-v heads
+ const uint32_t n_head_kv = this->n_head_kv(il);
+
return n_embd_head_v * n_head_kv;
}
}
};
+static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
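+// note: the fixed-size std::array members (rather than std::vector) are what
+// keep llama_hparams trivially copyable, as the assertion above requires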
+
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
void * cb_eval_user_data;
};
+// TODO: separate into "llama_layer_enc" and "llama_layer_dec"
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
struct ggml_tensor * attn_sub_norm;
+ struct ggml_tensor * attn_post_norm;
struct ggml_tensor * ffn_sub_norm;
+ struct ggml_tensor * attn_norm_cross;
+ struct ggml_tensor * attn_norm_enc;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wq_b;
struct ggml_tensor * wkv_a_mqa;
struct ggml_tensor * wkv_b;
+ struct ggml_tensor * wq_cross;
+ struct ggml_tensor * wk_cross;
+ struct ggml_tensor * wv_cross;
+ struct ggml_tensor * wo_cross;
+ struct ggml_tensor * wq_enc;
+ struct ggml_tensor * wk_enc;
+ struct ggml_tensor * wv_enc;
+ struct ggml_tensor * wo_enc;
// attention bias
struct ggml_tensor * bq;
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
+ // relative position bias
+ struct ggml_tensor * attn_rel_b;
+ struct ggml_tensor * attn_rel_b_enc;
+ struct ggml_tensor * attn_rel_b_cross;
+
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
+ struct ggml_tensor * ffn_norm_enc;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
+ struct ggml_tensor * ffn_gate_enc;
+ struct ggml_tensor * ffn_down_enc;
+ struct ggml_tensor * ffn_up_enc;
// ff MoE
struct ggml_tensor * ffn_gate_inp;
int32_t layer_start = -1;
int32_t layer_end = -1;
- ggml_tensor * tensor_for(int il) const {
+ struct ggml_tensor * tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}
return tensors[il];
}
+ struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+ ggml_tensor * layer_dir = tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx, cur, layer_dir);
+ }
+ return cur;
+ }
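+    // usage (sketch): in the graph build, after computing a layer's output:
+    //   cur = lctx.cvec.apply_to(ctx0, cur, il);
+    // this adds the control-vector direction for layer il, if one is loaded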
+
~llama_control_vector() {
for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
// tokenizer flags
- bool tokenizer_add_space_prefix = true;
+ bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
+ bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
+ bool tokenizer_remove_extra_whitespaces = false;
+ bool tokenizer_escape_whitespaces = true;
+ bool tokenizer_treat_whitespace_as_suffix = false;
+
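+    // raw normalization data (precompiled charsmap) from the SentencePiece model,
+    // used by the UGM tokenizer to normalize text before tokenizing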
+ std::vector<char> precompiled_charsmap;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
GGML_ASSERT(token_left.find(' ') == std::string::npos);
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
struct ggml_tensor * output_b;
+ struct ggml_tensor * output_norm_enc;
std::vector<llama_layer> layers;
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
+ // whether we are computing encoder output or decoder output
+ bool is_encoding = false;
+
+ // output of the encoder part of the encoder-decoder models
+ std::vector<float> embd_enc;
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
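+    // (filled during encoding; the decoder's cross-attention reads its
+    // K/V values from these embeddings)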
+
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_t sched = nullptr;
void * abort_callback_data = nullptr;
// input tensors
- struct ggml_tensor * inp_tokens; // I32 [n_batch]
- struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
- struct ggml_tensor * inp_pos; // I32 [n_batch]
- struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
- struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_K_shift; // I32 [kv_size]
- struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
- struct ggml_tensor * inp_cls; // I32 [n_batch]
- struct ggml_tensor * inp_s_copy; // I32 [kv_size]
- struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
- struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
+ struct ggml_tensor * inp_tokens; // I32 [n_batch]
+ struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
+ struct ggml_tensor * inp_pos; // I32 [n_batch]
+ struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
+ struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
+ struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
+ struct ggml_tensor * inp_K_shift; // I32 [kv_size]
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
+ struct ggml_tensor * inp_cls; // I32 [n_batch]
+ struct ggml_tensor * inp_s_copy; // I32 [kv_size]
+ struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+ struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
+ struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
+ struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
+ struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
// control vectors
struct llama_control_vector cvec;
const struct llama_hparams & hparams = model.hparams;
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
- const int64_t n_layer = hparams.n_layer;
+ const int64_t n_layer = hparams.n_layer;
cache.has_shift = false;
cache.recurrent = model.arch == LLM_ARCH_MAMBA;
cache.v_trans = !cparams.flash_attn;
- // TODO: support mixed recurrent Transformer architectures
- // NOTE: (!a || b) is a logical implication (a -> b)
- GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
- GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
- GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa());
- GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa());
-
cache.head = 0;
cache.size = kv_size;
cache.used = 0;
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range, return early to avoid looping over the cache.
+ if (p0 == p1) return;
if (cache.recurrent) {
// for Mamba-like models, only the pos needs to be shifted
int d) {
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range, return early to avoid looping over the cache.
+ if (p0 == p1) return;
if (cache.recurrent) {
// for Mamba-like models, only the pos needs to be changed
bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
const int kid = gguf_find_key(meta, key.c_str());
- if (kid < 0) {
+ if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
if (required) {
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
}
return false;
}
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
- if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
- throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ switch (arr_info.gt) {
+ case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+ case GGUF_TYPE_INT32: GGML_ASSERT(
+ (std::is_same<T, int32_t>::value) ||
+ (std::is_same<T, uint32_t>::value)); break;
+ default:
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
}
- // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
- GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
- GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
-
result.resize(arr_info.length);
result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
return true;
}
+ template<typename T, size_t N_MAX>
+ bool get_arr(const std::string & key, std::array<T, N_MAX> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ switch (arr_info.gt) {
+ case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+ case GGUF_TYPE_INT32: GGML_ASSERT(
+ (std::is_same<T, int32_t>::value) ||
+ (std::is_same<T, uint32_t>::value)); break;
+ default:
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ }
+
+ GGML_ASSERT(arr_info.length <= N_MAX);
+
+ std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+
+ return true;
+ }
+
template<typename T>
- bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ bool get_arr(const enum llm_kv kid, T & result, const bool required = true) {
return get_arr(llm_kv(kid), result, required);
}
return get_key(llm_kv(kid), result, required);
}
+ // get array of n <= N_MAX elements, or a single element repeated n times
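+    // (e.g. "%s.attention.head_count" may be stored as a single scalar or as a
+    //  per-layer array; both forms fill all n entries of the result)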
+ template<typename T, size_t N_MAX>
+ bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
+ GGML_ASSERT(n <= N_MAX);
+
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ if (n != arr_info.length) {
+ throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+ }
+
+ return get_arr(key, result, required);
+ } else {
+ T value;
+
+ bool ok = get_key(key, value, required);
+ if (!ok) {
+ return false;
+ }
+
+ for (uint32_t i = 0; i < n; i++) {
+ result[i] = value;
+ }
+
+ return true;
+ }
+ }
+
+ template<typename T>
+ bool get_key_or_arr(const enum llm_kv kid, T & result, uint32_t n, const bool required = true) {
+ return get_key_or_arr(llm_kv(kid), result, n, required);
+ }
+
std::string get_arch_name() const {
return arch_name;
}
#if defined(GGML_USE_CUDA)
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
- constexpr size_t num_buffers = 4;
+ constexpr size_t n_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
std::vector<ggml_backend_buffer_t> host_buffers;
// If the cuda backend is active create pinned memory buffers and events for synchronisation.
if (cuda_backend) {
- for (size_t idx = 0; idx < num_buffers; ++idx) {
+ for (size_t idx = 0; idx < n_buffers; ++idx) {
host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
events.emplace_back(ggml_backend_event_new(cuda_backend));
bytes_read += read_iteration;
++buffer_idx;
- buffer_idx %= num_buffers;
+ buffer_idx %= n_buffers;
}
}
else
#if defined(GGML_USE_CUDA)
// free temporary resources used for async cuda uploads
if (cuda_backend) {
- for (size_t idx = 0; idx < num_buffers;++idx) {
+ for (size_t idx = 0; idx < n_buffers;++idx) {
ggml_backend_event_synchronize(events[idx]);
ggml_backend_event_free(events[idx]);
ggml_backend_buffer_free(host_buffers[idx]);
case MODEL_17M: return "17M";
case MODEL_22M: return "22M";
case MODEL_33M: return "33M";
+ case MODEL_60M: return "60M";
case MODEL_70M: return "70M";
+ case MODEL_80M: return "80M";
case MODEL_109M: return "109M";
case MODEL_137M: return "137M";
case MODEL_160M: return "160M";
+ case MODEL_220M: return "220M";
+ case MODEL_250M: return "250M";
+ case MODEL_270M: return "270M";
case MODEL_335M: return "335M";
case MODEL_410M: return "410M";
+ case MODEL_450M: return "450M";
+ case MODEL_770M: return "770M";
+ case MODEL_780M: return "780M";
case MODEL_0_5B: return "0.5B";
case MODEL_1B: return "1B";
+ case MODEL_1_3B: return "1.3B";
case MODEL_1_4B: return "1.4B";
case MODEL_2B: return "2B";
case MODEL_2_8B: return "2.8B";
case MODEL_3B: return "3B";
case MODEL_4B: return "4B";
+ case MODEL_6B: return "6B";
case MODEL_6_9B: return "6.9B";
case MODEL_7B: return "7B";
case MODEL_8B: return "8B";
+ case MODEL_9B: return "9B";
+ case MODEL_11B: return "11B";
case MODEL_12B: return "12B";
case MODEL_13B: return "13B";
case MODEL_14B: return "14B";
case MODEL_8x22B: return "8x22B";
case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+ case MODEL_57B_A14B: return "57B.A14B";
+ case MODEL_27B: return "27B";
default: return "?B";
}
}
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
default: return "unknown";
}
}
ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
// get hparams kv
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
// everything past this point is not vocab-related
if (hparams.vocab_only) {
return;
}
- ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
- ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
- ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
- ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
- ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
- ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
- ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
GGML_ASSERT(hparams.n_expert_used == 0);
}
+ // zero-out the per-layer hparams
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+
// n_head_kv is optional, default to n_head
- hparams.n_head_kv = hparams.n_head;
- ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
+ hparams.n_head_kv_arr = hparams.n_head_arr;
+
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
bool rope_finetuned = false;
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
- // sanity check for n_rot (optional)
- {
- hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
+ // non-transformer models do not have attention heads
+ if (hparams.n_head() > 0) {
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+ // gpt-j n_rot = rotary_dim
+
+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
+ // sanity check for n_rot (optional)
+ hparams.n_rot = hparams.n_embd_head_k;
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
- if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ if (hparams.n_rot != hparams.n_embd_head_k) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
}
- // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
- // gpt-j n_rot = rotary_dim
+ } else {
+ hparams.n_rot = 0;
+ hparams.n_embd_head_k = 0;
+ hparams.n_embd_head_v = 0;
}
- hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
-
- hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
-
// arch-specific KVs
switch (model.arch) {
case LLM_ARCH_LLAMA:
case 40: model.type = e_model::MODEL_13B; break;
case 48: model.type = e_model::MODEL_34B; break;
case 60: model.type = e_model::MODEL_30B; break;
- case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+ case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
}
switch (hparams.n_layer) {
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+ case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
case 80: model.type = e_model::MODEL_70B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_A2_7B; break;
+ case 28: model.type = e_model::MODEL_57B_A14B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+                hparams.n_swa = 4096; // default value for Gemma 2
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+ hparams.attn_soft_cap = true;
+
+ switch (hparams.n_layer) {
+ case 42: model.type = e_model::MODEL_9B; break;
+ case 46: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_OPENELM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 16: model.type = e_model::MODEL_270M; break;
+ case 20: model.type = e_model::MODEL_450M; break;
+ case 28: model.type = e_model::MODEL_1B; break;
+ case 36: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_GPTNEOX:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
switch (hparams.n_layer) {
case 6:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 512: model.type = e_model::MODEL_14M; break;
case 2048: model.type = e_model::MODEL_70M; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
case 12:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 3072: model.type = e_model::MODEL_160M; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
case 16:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 8192: model.type = e_model::MODEL_1B; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
case 24:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 4096: model.type = e_model::MODEL_410M; break;
case 8192: model.type = e_model::MODEL_1_4B; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
case 32:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 10240: model.type = e_model::MODEL_2_8B; break;
case 16384: model.type = e_model::MODEL_6_9B; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
case 36:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 20480: model.type = e_model::MODEL_12B; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
case 44:
- switch (hparams.n_ff) {
+ switch (hparams.n_ff()) {
case 24576: model.type = e_model::MODEL_20B; break;
default: model.type = e_model::MODEL_UNKNOWN;
} break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_CHATGLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: model.type = e_model::MODEL_6B; break;
+ case 40: model.type = e_model::MODEL_9B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_T5:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+
+ uint32_t dec_start_token_id;
+ if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
+ hparams.dec_start_token_id = dec_start_token_id;
+ }
+
+ switch (hparams.n_layer) {
+ case 6: model.type = e_model::MODEL_60M; break; // t5-small
+ case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
+ case 12:
+ switch (hparams.n_ff()) {
+ case 3072: model.type = e_model::MODEL_220M; break; // t5-base
+ case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff()) {
+ case 4096: model.type = e_model::MODEL_770M; break; // t5-large
+ case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
+ case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
+ case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
+ case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
+ case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1_3B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ /* TODO: add variants */
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
default: (void)0;
}
vocab.special_pad_id = -1;
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;
-
- const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
- if (add_space_prefix_keyidx != -1) {
- vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
- } // The default value of add_space_prefix is true.
} else if (tokenizer_model == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;
vocab.special_pad_id = 0;
vocab.special_cls_id = 101;
vocab.special_mask_id = 103;
- vocab.tokenizer_add_space_prefix = false;
} else if (tokenizer_model == "gpt2") {
vocab.type = LLAMA_VOCAB_TYPE_BPE;
- const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
- if (add_space_prefix_keyidx != -1) {
- vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
- }
-
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
-
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
-
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
vocab.special_pad_id = -1;
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;
+ } else if (tokenizer_model == "t5") {
+ vocab.type = LLAMA_VOCAB_TYPE_UGM;
+
+ // default special tokens
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = 1;
+ vocab.special_unk_id = 2;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = 0;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ } // The default value of add_space_prefix is true.
+
+ const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
+ if (remove_extra_whitespaces_keyidx != -1) {
+ vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
+ } // The default value of remove_extra_whitespaces is false.
+
+ const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+ if (precompiled_charsmap_keyidx != -1) {
+ size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+ vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
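+                // blob layout: a uint32 holding the size of the XCDA (double-array trie)
+                // section, followed by the trie data and then the replacement strings
+                // used during normalization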
+#ifdef IS_BIG_ENDIAN
+            // correct endianness of data in precompiled_charsmap binary blob
+ uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
+ *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+ assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+ size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+ uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
+ for (size_t i = 0; i < xcda_array_size; ++i) {
+ xcda_array[i] = __builtin_bswap32(xcda_array[i]);
+ }
+#endif
+ }
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+ vocab.tokenizer_add_space_prefix = false;
+ vocab.tokenizer_clean_spaces = true;
if (tokenizer_pre.empty()) {
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
LLAMA_LOG_WARN("%s: \n", __func__);
} else if (
tokenizer_pre == "deepseek-llm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+ vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-coder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+ vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "falcon") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
} else if (
tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "phi-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de" ||
tokenizer_pre == "jina-v2-es" ||
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "stablelm2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
} else if (
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+ vocab.tokenizer_clean_spaces = false;
+ } else if (
+ tokenizer_pre == "chatglm-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+ vocab.special_bos_id = -1;
+ } else if (
+ tokenizer_pre == "viking") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
+ vocab.tokenizer_clean_spaces = false;
+ } else if (
+ tokenizer_pre == "jais") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_space_prefix = true;
+ vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_add_eos = false;
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_space_prefix = false;
+ vocab.tokenizer_clean_spaces = true;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_add_eos = false;
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_bos = false;
+ vocab.tokenizer_add_eos = true;
} else {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ }
}
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
if (gen_name.find("code") != std::string::npos) {
if (model.arch == LLM_ARCH_LLAMA
&& 32010 < vocab.id_to_token.size()
- && vocab.id_to_token[32007].text == "<PRE>"
- && vocab.id_to_token[32008].text == "<SUF>"
- && vocab.id_to_token[32009].text == "<MID>"
- && vocab.id_to_token[32010].text == "<EOT>") {
+ && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+ && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+ && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+ && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
vocab.special_prefix_id = 32007;
vocab.special_suffix_id = 32008;
vocab.special_middle_id = 32009;
vocab.special_eot_id = 107;
}
}
-
try {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} catch (const std::exception & e) {
}
}
- std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+ std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
[&] (const llama_vocab::id a, const llama_vocab::id b) {
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
}
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
- // hparams
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
- LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
- LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
- LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
- LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
- LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
- LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
- LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
- LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
- LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
- LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
- LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
- LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
- LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
- LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
- LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
- LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
- if (ml.n_elements >= 1e12) {
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+ auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
+ bool is_var = false;
+
+ std::vector<uint32_t> v;
+ for (uint32_t i = 0; i < n; ++i) {
+ v.push_back(f(i));
+ if (v[i] != v[0]) {
+ is_var = true;
+ }
+ }
+
+ std::stringstream ss;
+
+ if (is_var) {
+ ss << "[";
+ for (uint32_t i = 0; i < n; ++i) {
+ ss << v[i];
+ if (i < n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ } else {
+ ss << v[0];
+ }
+
+ return ss.str();
+ };
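+    // print_f renders a per-layer hyperparameter: a single number when the value
+    // is the same for every layer, otherwise the full per-layer list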
+
+ // hparams
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+
+ if (!hparams.vocab_only) {
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+ }
+
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
+ LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+ if (ml.n_elements >= 1e12) {
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
} else if (ml.n_elements >= 1e9) {
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
} else if (ml.n_elements >= 1e6) {
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;
- const int64_t n_layer = hparams.n_layer;
- const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+ const int n_layer = hparams.n_layer;
+ const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
bool use_mmap_buffer = true;
// there is very little benefit to offloading the input layer, so always keep it on the CPU
model.buft_layer.resize(n_layer);
// assign cpu layers
- for (int64_t i = 0; i < i_gpu_start; ++i) {
+ for (int i = 0; i < i_gpu_start; ++i) {
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
}
// assign the repeating layers to the devices according to the splits
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
- for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+ for (int i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
}
split_buft = llama_default_buffer_type_offload(model, main_gpu);
}
// assign the repeating layers
- for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+ for (int i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = {
split_buft,
llama_default_buffer_type_offload(model, main_gpu)
buft_layer_count[model.buft_input.buft_matrix]++;
buft_layer_count[model.buft_output.buft]++;
buft_layer_count[model.buft_output.buft_matrix]++;
- for (int64_t i = 0; i < n_layer; ++i) {
+ for (int i = 0; i < n_layer; ++i) {
buft_layer_count[model.buft_layer[i].buft]++;
buft_layer_count[model.buft_layer[i].buft_matrix]++;
}
// create tensors for the weights
{
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_embd_gqa = n_embd_v_gqa;
- const int64_t n_vocab = hparams.n_vocab;
- const int64_t n_vocab_type = hparams.n_vocab_type;
- const int64_t n_ff = hparams.n_ff;
- const int64_t n_expert = hparams.n_expert;
+ // note: cast to int64_t since we will use these for the tensor dimensions
+ const int64_t n_head = hparams.n_head();
+ const int64_t n_head_kv = hparams.n_head_kv();
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_ff = hparams.n_ff();
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_vocab_type = hparams.n_vocab_type;
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_ctx_train = hparams.n_ctx_train;
if (n_expert > 0 && hparams.n_expert_used == 0) {
throw std::runtime_error("model has expert layers but no expert layers are used");
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
- auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
- auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+
+ auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+ auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
model.layers.resize(n_layer);
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
if (layer.ffn_gate_exps) {
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
// output
{
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
if (!model.output) {
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
}
case LLM_ARCH_STARCODER:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
// output
{
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_BERT:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+
if (model.arch == LLM_ARCH_BERT) {
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
}
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
auto & layer = model.layers[i];
if (model.arch == LLM_ARCH_BERT) {
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
} else {
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
}
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
if (model.arch == LLM_ARCH_BERT) {
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
} else {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
- model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); // token_type_embeddings
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
auto & layer = model.layers[i]; // JinaBertLayer
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
- layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // output.dense
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); // output.dense
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
- layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
- layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
}
} break;
case LLM_ARCH_BLOOM:
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_MPT:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
// output
{
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
if (!model.output) {
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
}
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
// optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
- GGML_ASSERT(hparams.n_expert > 0);
- GGML_ASSERT(hparams.n_expert_used > 0);
+ GGML_ASSERT(n_expert > 0);
+ GGML_ASSERT(n_expert_used > 0);
// MoE branch
- auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
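+ // note: when the GGUF does not provide an expert FFN size, assume n_ff is split evenly across the active experts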
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
// Shared expert branch
- auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
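+ // note: the shared expert falls back to the full n_ff when no separate size is given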
+
layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
- layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
- layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
- layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp});
}
} break;
case LLM_ARCH_PHI2:
} break;
case LLM_ARCH_PHI3:
{
+ const int64_t n_embd_head = n_embd / n_head;
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
// output
}
for (int i = 0; i < n_layer; ++i) {
- ggml_context* ctx_layer = ctx_for_layer(i);
- ggml_context* ctx_split = ctx_for_layer_split(i);
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
case LLM_ARCH_GPT2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
- const int64_t n_ff = hparams.n_ff;
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
- for (uint32_t i = 0; i < n_layer; ++i) {
+ for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
}
} break;
case LLM_ARCH_STARCODER2:
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
const int64_t dt_rank = hparams.ssm_dt_rank;
+
// only an expansion factor of 2 is supported for now
GGML_ASSERT(2 * n_embd == d_inner);
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
+
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
+
auto & layer = model.layers[i];
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
if (n_layer >= 64){
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
}
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
+ case LLM_ARCH_OPENELM:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ // init output from the input tok embed
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
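+ // note: OpenELM scales n_head, n_head_kv and n_ff per layer,
+ // so the fused QKV projection below is (n_head + 2*n_head_kv) heads of width n_embd_head_k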
+ const int64_t n_head = hparams.n_head(i);
+ const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
+ const int64_t n_ff = hparams.n_ff(i);
+
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k});
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k});
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
case LLM_ARCH_GPTNEOX:
{
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
// output
{
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
} break;
case LLM_ARCH_DEEPSEEK2:
{
- bool is_lite = (hparams.n_layer == 27);
+ const bool is_lite = (hparams.n_layer == 27);
+
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
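+ // note: with DeepSeek-V2's latent attention, each query/key head splits into a RoPE'd part of n_rot dims and a non-RoPE'd remainder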
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
- const uint32_t q_lora_rank = hparams.n_lora_q;
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
- const uint32_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
if (!is_lite) {
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
}
+
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
if (!is_lite) {
- layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
- layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
} else {
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
}
- layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
- layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
+
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- if ((uint32_t) i < hparams.n_layer_dense_lead) {
+ if (i < (int) hparams.n_layer_dense_lead) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
} else {
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
- GGML_ASSERT(hparams.n_expert > 0);
- GGML_ASSERT(hparams.n_expert_used > 0);
+ GGML_ASSERT(n_expert > 0);
+ GGML_ASSERT(n_expert_used > 0);
// MoE branch
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
// Shared expert branch
- layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
- layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
- layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
}
}
} break;
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
- layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
- layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+ }
+ } break;
+ case LLM_ARCH_T5:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd});
+
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_rel_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.attn_norm_cross = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd});
+ // this tensor seems to be unused in the HF transformers implementation
+ layer.attn_rel_b_cross = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ }
+ } break;
+ case LLM_ARCH_CHATGLM:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + (hparams.n_embd_head_k << 2)});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + (hparams.n_embd_head_k << 2)});
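+ // note: assuming 2 KV heads as in ChatGLM2/3, K and V together contribute 4*n_embd_head_k columns, hence the << 2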
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2});
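+ // note: the up projection is 2*n_ff wide because gate and up are fused and split later by LLM_FFN_SWIGLU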
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
}
} break;
default:
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
+ LLM_FFN_SWIGLU,
};
enum llm_ffn_gate_type {
int64_t il) {
const int64_t n_ctx = cparams.n_ctx;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
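+ // note: K/V widths are now taken per-layer to support models with non-uniform head counts (e.g. OpenELM)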
GGML_ASSERT(kv.size == n_ctx);
struct ggml_tensor * cur,
struct ggml_tensor * up,
struct ggml_tensor * up_b,
+ struct ggml_tensor * up_s,
struct ggml_tensor * gate,
struct ggml_tensor * gate_b,
+ struct ggml_tensor * gate_s,
struct ggml_tensor * down,
struct ggml_tensor * down_b,
+ struct ggml_tensor * down_s,
struct ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
cb(tmp, "ffn_up_b", il);
}
+ if (up_s) {
+ tmp = ggml_mul(ctx, tmp, up_s);
+ cb(tmp, "ffn_up_s", il);
+ }
+
if (gate) {
switch (type_gate) {
case LLM_FFN_SEQ:
cur = ggml_add(ctx, cur, gate_b);
cb(cur, "ffn_gate_b", il);
}
+
+ if (gate_s) {
+ cur = ggml_mul(ctx, cur, gate_s);
+ cb(cur, "ffn_gate_s", il);
+ }
+
} else {
cur = tmp;
}
cur = ggml_sqr(ctx, cur);
cb(cur, "ffn_sqr(relu)", il);
} break;
+ case LLM_FFN_SWIGLU:
+ {
+ // the up projection was computed at double width; split it in half and apply SwiGLU: silu(x0) * x1, see https://arxiv.org/pdf/2002.05202.pdf
+ int64_t split_point = cur->ne[0] / 2;
+ struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
+ struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+ x0 = ggml_silu(ctx, x0);
+ cb(cur, "ffn_silu", il);
+
+ cur = ggml_mul(ctx, x0, x1);
+ cb(cur, "ffn_mul", il);
+ } break;
}
if (type_gate == LLM_FFN_PAR) {
cb(cur, "ffn_gate_par", il);
}
- cur = ggml_mul_mat(ctx, down, cur);
+ if (down) {
+ cur = ggml_mul_mat(ctx, down, cur);
+ }
+
if (down_b) {
cb(cur, "ffn_down", il);
}
cur = ggml_add(ctx, cur, down_b);
}
+ if (down_s) {
+ cur = ggml_mul(ctx, cur, down_s);
+ cb(cur, "ffn_down_s", il);
+ }
+
return cur;
}
const llm_build_cb & cb,
int il) {
const int64_t n_ctx = cparams.n_ctx;
- const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_head = hparams.n_head(il);
+ const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_head_v = hparams.n_embd_head_v;
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
cb(q, "q", il);
kq = ggml_scale(ctx, kq, 30);
}
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+ if (hparams.attn_soft_cap) {
+ kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
+ kq = ggml_tanh(ctx, kq);
+ kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
+ }
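+ // i.e. kq = softcap * tanh(kq / softcap), smoothly bounding the attention logits to (-softcap, softcap) (used by Gemma 2)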
+
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
GGML_ASSERT(kv.size == n_ctx);
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
const int32_t n_outputs;
+ const int32_t n_outputs_enc;
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_ctx_orig;
n_layer (hparams.n_layer),
n_rot (hparams.n_rot),
n_ctx (cparams.n_ctx),
- n_head (hparams.n_head),
- n_head_kv (hparams.n_head_kv),
+ n_head (hparams.n_head()),
+ n_head_kv (hparams.n_head_kv()),
n_embd_head_k (hparams.n_embd_head_k),
n_embd_k_gqa (hparams.n_embd_k_gqa()),
n_embd_head_v (hparams.n_embd_head_v),
n_tokens (batch.n_tokens),
n_kv (worst_case ? kv_self.size : kv_self.n),
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
+ n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
n_ctx_orig (cparams.n_ctx_orig_yarn),
flash_attn (cparams.flash_attn),
ctx0 = ggml_init(params);
- lctx.inp_tokens = nullptr;
- lctx.inp_embd = nullptr;
- lctx.inp_pos = nullptr;
- lctx.inp_out_ids = nullptr;
- lctx.inp_KQ_mask = nullptr;
- lctx.inp_K_shift = nullptr;
- lctx.inp_mean = nullptr;
- lctx.inp_cls = nullptr;
- lctx.inp_s_copy = nullptr;
- lctx.inp_s_mask = nullptr;
- lctx.inp_s_seq = nullptr;
+ lctx.inp_tokens = nullptr;
+ lctx.inp_embd = nullptr;
+ lctx.inp_pos = nullptr;
+ lctx.inp_out_ids = nullptr;
+ lctx.inp_KQ_mask = nullptr;
+ lctx.inp_KQ_mask_swa = nullptr;
+ lctx.inp_K_shift = nullptr;
+ lctx.inp_mean = nullptr;
+ lctx.inp_cls = nullptr;
+ lctx.inp_s_copy = nullptr;
+ lctx.inp_s_mask = nullptr;
+ lctx.inp_s_seq = nullptr;
+ lctx.inp_pos_bucket = nullptr;
+ lctx.inp_embd_enc = nullptr;
+ lctx.inp_KQ_mask_cross = nullptr;
}
void free() {
cb(lctx.inp_K_shift, "K_shift", -1);
ggml_set_input(lctx.inp_K_shift);
-
for (int il = 0; il < n_layer; ++il) {
+ const int64_t n_head_kv = hparams.n_head_kv(il);
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
struct ggml_tensor * rope_factors = build_rope_factors(il);
struct ggml_tensor * tmp =
// we rotate only the first n_rot dimensions
}
for (int il = 0; il < n_layer; ++il) {
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
}
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
- if (causal) {
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
- } else {
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
- }
+ lctx.inp_KQ_mask = causal
+ ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
+ : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
cb(lctx.inp_KQ_mask, "KQ_mask", -1);
ggml_set_input(lctx.inp_KQ_mask);
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
}
+ struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) {
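+ // sliding-window variant of the KQ mask: positions further than n_swa behind the query are masked out as well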
+ GGML_ASSERT(hparams.n_swa > 0);
+
+ lctx.inp_KQ_mask_swa = causal
+ ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
+ : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+ cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1);
+ ggml_set_input(lctx.inp_KQ_mask_swa);
+
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa;
+ }
+
struct ggml_tensor * build_inp_mean() {
lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
cb(lctx.inp_mean, "inp_mean", -1);
return gf;
}
+ struct ggml_tensor * llm_build_pos_bucket(bool causal) {
+ if (causal) {
+ lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
+ } else {
+ lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
+ }
+
+ ggml_set_input(lctx.inp_pos_bucket);
+ cb(lctx.inp_pos_bucket, "pos_bucket", -1);
+
+ return lctx.inp_pos_bucket;
+ }
+
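+ // turn the T5-style relative position buckets into a per-head additive bias for the KQ matrix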
+ struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) {
+ struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0);
+ cb(pos_bucket_1d, "pos_bucket_1d", -1);
+
+ struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
+ cb(pos_bias, "pos_bias", -1);
+
+ pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0);
+ cb(pos_bias, "pos_bias", -1);
+
+ pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3);
+ cb(pos_bias, "pos_bias", -1);
+
+ pos_bias = ggml_cont(ctx0, pos_bias);
+ cb(pos_bias, "pos_bias", -1);
+
+ return pos_bias;
+ }
+
+ struct ggml_tensor * llm_build_inp_embd_enc() {
+ const int64_t n_embd = hparams.n_embd;
+ lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
+ ggml_set_input(lctx.inp_embd_enc);
+ cb(lctx.inp_embd_enc, "embd_enc", -1);
+ return lctx.inp_embd_enc;
+ }
+
+ struct ggml_tensor * llm_build_inp_KQ_mask_cross() {
+ lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+ ggml_set_input(lctx.inp_KQ_mask_cross);
+ cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1);
+ return lctx.inp_KQ_mask_cross;
+ }
+
struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
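// note: the tensor_for()/ggml_add() pattern removed above is folded into a
// helper; a minimal sketch of the member this refactor presumes, with
// semantics identical to the removed inline code:
//
//   struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
//       ggml_tensor * layer_dir = tensor_for(il);
//       if (layer_dir != nullptr) {
//           cur = ggml_add(ctx, cur, layer_dir);
//       }
//       return cur;
//   }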
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
// feed forward
{
cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
- model.layers[il].ffn_up, NULL,
- NULL, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "l_out", il);
-
cur = ggml_add(ctx0, cur, inpL);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, hparams,
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
// feed-forward network
if (model.arch == LLM_ARCH_BERT) {
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
} else {
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
}
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, hparams,
LLM_NORM, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
model.layers[il].ffn_act,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cur = inpSA;
}
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur_gate, "ffn_shexp_gate", il);
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up_shexp, NULL,
- model.layers[il].ffn_gate_shexp, NULL,
- model.layers[il].ffn_down_shexp, NULL,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur_ffn, "ffn_shexp", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
// FF
{
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(ffn_output, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_output);
- cb(cur, "l_out", il);
-
cur = ggml_add(ctx0, cur, inpL);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
+ // input for next layer
inpL = cur;
}
// special-case: the up and gate tensors are merged into a single tensor
// TODO: support in llm_build_ffn
{
- struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
- cb(up, "ffn_up", il);
-
- auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
- auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
-
- y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
- cb(y, "ffn_gate", il);
-
- auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
- cb(down, "ffn_down", il);
-
- cur = down;
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
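// note: LLM_FFN_SWIGLU reproduces the removed hand-rolled code: the merged up
// projection is split in half along ne[0], the first half passes through SiLU
// and gates the second. Scalar sketch (helper name is illustrative only):
//
//   #include <cmath>
//   #include <vector>
//
//   static std::vector<float> swiglu_merged(const std::vector<float> & up, size_t half) {
//       std::vector<float> out(half);
//       for (size_t i = 0; i < half; ++i) {
//           const float g = up[i];        // first half: gate
//           const float v = up[half + i]; // second half: value
//           out[i] = v * (g / (1.0f + std::exp(-g))); // v * silu(g)
//       }
//       return out;
//   }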
cur = ggml_add(ctx0, residual, cur);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
+ // input for next layer
inpL = cur;
}
// feed-forward network
{
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, sa_out);
- cb(cur, "l_out", il);
-
cur = ggml_add(ctx0, cur, inpL);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, hparams,
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, hparams,
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cb(cur, "hidden_scaled_ffn", -1);
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
- n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
- n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
// feed-forward network
{
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+ // gemma 2 requires a different mask for layers using sliding window attention (SWA)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
+ struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+ for (int il = 0; il < n_layer; ++il) {
+ // every second layer (even il) uses SWA
+ struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? KQ_mask_swa : KQ_mask;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_post_norm", il);
+
cur = ggml_add(ctx0, cur, sa_out);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
+
+ // final logit soft-capping
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+
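+ // note: the three ops above compute softcap(x) = C * tanh(x / C) with
+ // C = hparams.f_final_logit_softcapping: approximately identity for
+ // |x| << C, saturating at +/-C. Scalar sketch:
+ //
+ //   #include <cmath>
+ //
+ //   static float softcap(float x, float cap) {
+ //       return cap * std::tanh(x / cap);
+ //   }
+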
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
+
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
// residual
cur = ggml_add(ctx0, cur, inpL);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
// feed-forward network
{
cur = llm_build_ffn(ctx0, ffn_inp,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
// add together residual + FFN + self-attention
cur = ggml_add(ctx0, cur, inpL);
cur = ggml_add(ctx0, cur, attn_out);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
return gf;
}
- struct ggml_cgraph * build_gptneox() {
+ struct ggml_cgraph * build_openelm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
-
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
+ const int64_t n_head = hparams.n_head(il);
+ const int64_t n_head_kv = hparams.n_head_kv(il);
+ const int64_t n_head_qkv = 2*n_head_kv + n_head;
+
+ cur = inpL;
+ struct ggml_tensor * residual = cur;
+
+ // norm
cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, cb, il);
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
cb(Vcur, "Vcur", il);
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Kcur, "Kcur", il);
+
Qcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
+ ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
+ ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
+ Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+ cb(Qcur, "Vcur", il);
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- model.layers[il].wo, model.layers[il].bo,
+ model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
}
- // ffn
- if (hparams.use_par_res) {
- // attention and ffn are computed in parallel
- // x = x + attn(ln1(x)) + ffn(ln2(x))
-
- struct ggml_tensor * attn_out = cur;
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
+ cb(ffn_inp, "ffn_inp", il);
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, cb, il);
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
+ }
- cur = ggml_add(ctx0, cur, inpL);
- cb(cur, "ffn_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
- inpL = ggml_add(ctx0, cur, attn_out);
- cb(inpL, "l_out", il);
- } else {
- // attention and ffn are computed sequentially
- // x = x + attn(ln1(x))
- // x = x + ffn(ln2(x))
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
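+ // note: build_openelm reads n_head/n_head_kv per layer because OpenELM varies
+ // head counts with depth. A sketch of the per-layer accessor this assumes;
+ // the array member name is a guess (cf. LLAMA_MAX_LAYERS):
+ //
+ //   uint32_t llama_hparams::n_head(uint32_t il) const {
+ //       GGML_ASSERT(il < n_layer);
+ //       return n_head_arr[il]; // std::array<uint32_t, LLAMA_MAX_LAYERS>
+ //   }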
+
+ struct ggml_cgraph * build_gptneox() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // ffn
+ if (hparams.use_par_res) {
+ // attention and ffn are computed in parallel
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
+
+ struct ggml_tensor * attn_out = cur;
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, attn_out);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ } else {
+ // attention and ffn are computed sequentially
+ // x = x + attn(ln1(x))
+ // x = x + ffn(ln2(x))
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
}
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_out);
cb(cur, "ffn_out", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
// FFN shared expert
{
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up_shexp, NULL,
- model.layers[il].ffn_gate_shexp, NULL,
- model.layers[il].ffn_down_shexp, NULL,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(ffn_shexp, "ffn_shexp", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
- cur = inpL;
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_bitnet() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ // B1.K
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ // B1.V
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ NULL, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_sub_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_sub_norm", il);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ if (model.layers[il].bo) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
+ }
+ cb(cur, "attn_o_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+ NULL, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_sub_out", il);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_sub_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_sub_norm", il);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ cb(cur, "ffn_down", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
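+ // note: BitNet stores ternary quantized projection weights plus a per-tensor
+ // scale, so each matmul above is followed by ggml_mul with the matching
+ // *_scale tensor: y = (W_q x) * s. The ffn_up_scale/ffn_gate_scale arguments
+ // in the llm_build_ffn call are the same scales, routed through the new
+ // optional scale parameters that every other call site passes as NULL.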
+
+ struct ggml_cgraph * build_t5() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ if (lctx.is_encoding) {
+ struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
+
+ // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+ struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
+
+ struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+ struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+ struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);
+
+ kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
+
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+ cb(v, "v", il);
+
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);
+
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
+
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ ggml_build_forward_expand(gf, cur);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
+ cb(cur, "kqv_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // T5 uses ReLU, flan-T5 uses a gated GELU
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up_enc, NULL, NULL,
+ model.layers[il].ffn_gate_enc, NULL, NULL,
+ model.layers[il].ffn_down_enc, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm_enc, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+ } else {
+ struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
+ struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
+
+ struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
+ struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+
+ struct ggml_tensor * k =
+ ggml_view_3d(ctx0, kv_self.k_l[il],
+ n_embd_head_k, n_kv, n_head_kv,
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ 0);
+ cb(k, "k", il);
+
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx0, kv_self.v_l[il],
+ n_kv, n_embd_head_v, n_head_kv,
+ ggml_element_size(kv_self.v_l[il])*n_ctx,
+ ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+ 0);
+ cb(v, "v", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
+
+ struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+ struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+ struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);
+
+ kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
+
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+ cb(kqv, "kqv", il);
+
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
+
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ ggml_build_forward_expand(gf, cur);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+ cb(cur, "kqv_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "cross_inp", il);
+
+ struct ggml_tensor * inpCA = cur;
+
+ // norm
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_norm_cross, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm_cross", il);
+
+ // cross-attention
+ {
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
+
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
+
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+ cb(v, "v", il);
+
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);
+
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
+
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ ggml_build_forward_expand(gf, cur);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
+ cb(cur, "kqv_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // T5 uses ReLU, flan-T5 uses a gated GELU
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+ }
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
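+ // note: the encoder branch ends at result_norm (embeddings, no lm_head); the
+ // decoder branch reads them back through inp_embd_enc and KQ_mask_cross. A
+ // hedged sketch of the intended call sequence, assuming the llama_encode()
+ // and llama_model_decoder_start_token() APIs introduced alongside this change:
+ //
+ //   std::vector<llama_token> tokens; // tokenized source text (elided)
+ //   llama_batch ebatch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);
+ //   llama_encode(ctx, ebatch); // runs the lctx.is_encoding branch, fills lctx.embd_enc
+ //
+ //   llama_token dec_start = llama_model_decoder_start_token(model);
+ //   llama_batch dbatch = llama_batch_get_one(&dec_start, 1, 0, 0);
+ //   llama_decode(ctx, dbatch); // decoder branch, cross-attends via inp_embd_enc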
+
+ struct ggml_cgraph * build_jais() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
- // lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
+
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
- struct ggml_cgraph * build_bitnet() {
+ struct ggml_cgraph * build_chatglm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpSA = inpL;
cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm, NULL,
+ model.layers[il].attn_norm,
+ NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
- // compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
+ struct ggml_tensor * Qcur = nullptr;
+ struct ggml_tensor * Kcur = nullptr;
+ struct ggml_tensor * Vcur = nullptr;
- // B1.K
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
- // B1.V
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+ //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
- cb(Qcur, "Qcur", il);
+ cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
- cb(Kcur, "Kcur", il);
+ cb(Kcur, "Kcur_rope", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- nullptr, nullptr,
+ model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].attn_sub_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_sub_norm", il);
-
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
- cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
- if (model.layers[il].bo) {
- cur = ggml_add(ctx0, cur, model.layers[il].bo);
- }
- cb(cur, "attn_o_out", il);
}
if (il == n_layer - 1) {
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
+ // add the input
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
- // feed-forward forward
- if (model.layers[il].ffn_gate_inp == nullptr) {
+ // FF
+ {
cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm, NULL,
+ model.layers[il].ffn_norm,
+ NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
- struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
- tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
- cb(tmp, "ffn_up", il);
-
- cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
- cb(cur, "ffn_gate", il);
-
- cur = ggml_silu(ctx0, cur);
- cb(cur, "ffn_silu", il);
-
- cur = ggml_mul(ctx0, cur, tmp);
- cb(cur, "ffn_gate_par", il);
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].ffn_sub_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_sub_norm", il);
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
- cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
- cb(cur, "ffn_down", il);
}
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
}
- cur = inpL;
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
- // lm_head
- cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+ cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
+
return gf;
}
-
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
{
result = llm.build_olmo();
} break;
+ case LLM_ARCH_OPENELM:
+ {
+ result = llm.build_openelm();
+ } break;
case LLM_ARCH_GPTNEOX:
{
result = llm.build_gptneox();
{
result = llm.build_deepseek2();
} break;
+ case LLM_ARCH_CHATGLM:
+ {
+ result = llm.build_chatglm();
+ } break;
case LLM_ARCH_BITNET:
{
result = llm.build_bitnet();
} break;
+ case LLM_ARCH_T5:
+ {
+ result = llm.build_t5();
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ result = llm.build_jais();
+ } break;
default:
GGML_ASSERT(false);
}
}
}
+static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+ // TODO move to hparams if a T5 variant appears that uses a different value
+ const int64_t max_distance = 128;
+
+ if (bidirectional) {
+ n_buckets >>= 1;
+ }
+
+ const int64_t max_exact = n_buckets >> 1;
+
+ int32_t relative_position = x - y;
+ int32_t relative_bucket = 0;
+ if (bidirectional) {
+ relative_bucket += (relative_position > 0) * n_buckets;
+ relative_position = abs(relative_position);
+ } else {
+ relative_position = -std::min<int32_t>(relative_position, 0);
+ }
+ int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+ relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+ relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+ return relative_bucket;
+}
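+// note: worked examples of the bucketing above, for n_buckets = 32:
+//   bidirectional (encoder): buckets halve to 16, max_exact = 8
+//     rel =  0 -> bucket 0          rel = -3 -> bucket 3
+//     rel = +3 -> bucket 16+3 = 19  rel = +20 -> 16 + floor(8 + ln(20/8)*8/ln(16)) = 26
+//   causal (decoder): n_buckets stays 32, max_exact = 16
+//     a key 5 positions behind the query (rel = -5) -> bucket 5
+//     any future key (rel > 0) -> bucket 0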
+
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
//
// set input data
if (lctx.inp_KQ_mask) {
// NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
- if (cparams.causal_attn) {
+ if (cparams.causal_attn && !lctx.is_encoding) {
const int64_t n_kv = kv_self.n;
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
- float * data = (float *) lctx.inp_KQ_mask->data;
+ float * data = (float *) lctx.inp_KQ_mask->data;
+ float * data_swa = nullptr;
+
+ if (lctx.inp_KQ_mask_swa) {
+ data_swa = (float *) lctx.inp_KQ_mask_swa->data;
+ }
// For causal attention, use only the previous KV cells
// of the correct sequence for each token of the batch.
}
}
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+
+ // may need to cut off old tokens for sliding window
+ if (data_swa) {
+ if (pos - lctx.kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
+ f = -INFINITY;
+ }
+ data_swa[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+ }
}
}
} else {
// when using kv cache, the mask needs to match the kv cache size
const int64_t n_tokens = batch.n_tokens;
- const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
+ const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
}
}
- if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(lctx.inp_mean);
}
}
- if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(lctx.inp_cls);
}
}
- if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(lctx.inp_cls);
}
}
}
+
+ if (lctx.inp_pos_bucket) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
+
+ int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;
+
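+ // note: lctx.is_encoding doubles as the `bidirectional` argument below -
+ // the encoder buckets positions symmetrically, the decoder only looks back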
+ if (!lctx.is_encoding) {
+ const int64_t n_kv = kv_self.n;
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ for (int i = 0; i < n_kv; ++i) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+ }
+ }
+ }
+ } else {
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ for (int i = 0; i < n_tokens; ++i) {
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+ }
+ }
+ }
+ }
+ }
+
+ if (!lctx.is_encoding && lctx.inp_embd_enc) {
+ assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
+ assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());
+
+ ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
+ }
+
+ if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
+ const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
+
+ float * data = (float *) lctx.inp_KQ_mask_cross->data;
+
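+ // a decoder token may only attend to encoder outputs produced for one of
+ // its own sequence ids; all other cross-attention entries stay at -INFINITY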
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ for (int i = 0; i < n_output_enc; ++i) {
+ float f = -INFINITY;
+ for (int s = 0; s < batch.n_seq_id[j]; ++s) {
+ const llama_seq_id seq_id = batch.seq_id[j][s];
+ if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
+ f = 0.0f;
+ }
+ }
+ data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
+ }
+ }
+
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+ for (int j = 0; j < n_output_enc; ++j) {
+ data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
+ }
+ }
+ }
+ }
}
// Make sure enough space is available for outputs.
// TODO: use a per-batch flag for logits presence instead
const bool has_logits = !cparams.embeddings;
- const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+ const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
llama_context & lctx,
llama_batch batch_all) { // TODO: rename back to batch
+ lctx.is_encoding = false;
const uint32_t n_tokens_all = batch_all.n_tokens;
if (n_tokens_all == 0) {
const auto n_ubatch = cparams.n_ubatch;
+ // TODO: simplify or deprecate
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id_arr;
std::vector<std::vector<llama_seq_id>> seq_id;
+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
// count outputs
- if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
- n_outputs = n_tokens_all;
- } else if (batch_all.logits) {
+ if (batch_all.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs += batch_all.logits[i] != 0;
}
- } else if (lctx.logits_all) {
+ } else if (lctx.logits_all || embd_pooled) {
n_outputs = n_tokens_all;
} else {
// keep last output only
{
int32_t n_outputs_new = 0;
- if (u_batch.logits) {
+ if (u_batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens; i++) {
n_outputs_new += u_batch.logits[i] != 0;
}
return 0;
}
+// encode a batch of tokens by evaluating the encoder part of the transformer
+//
+// - lctx: llama context
+// - batch: batch to evaluate
+//
+// return 0 on success
+// return positive int on warning
+// return negative int on error
+//
+static int llama_encode_internal(
+ llama_context & lctx,
+ llama_batch batch) {
+
+ lctx.is_encoding = true;
+
+ const uint32_t n_tokens = batch.n_tokens;
+
+ if (n_tokens == 0) {
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+ return -1;
+ }
+
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;
+
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+
+ // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
+ GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
+
+ if (lctx.t_compute_start_us == 0) {
+ lctx.t_compute_start_us = ggml_time_us();
+ }
+
+ lctx.n_queued_tokens += n_tokens;
+
+ const int64_t n_embd = hparams.n_embd;
+
+ // TODO: simplify or deprecate
+ std::vector<llama_pos> pos;
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id *> seq_id_arr;
+ std::vector<std::vector<llama_seq_id>> seq_id;
+
+ // reserve output buffer
+ if (llama_output_reserve(lctx, n_tokens) < n_tokens) {
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
+ return -2;
+ }
+
+ for (uint32_t i = 0; i < n_tokens; ++i) {
+ lctx.output_ids[i] = i;
+ }
+
+ lctx.inp_embd_enc = NULL;
+ lctx.n_outputs = n_tokens;
+
+ const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ GGML_ASSERT(n_threads > 0);
+
+ // helpers for smoother batch API transition
+ // after deprecating the llama_eval calls, these will be removed
+ if (batch.pos == nullptr) {
+ pos.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
+ }
+
+ batch.pos = pos.data();
+ }
+
+ if (batch.seq_id == nullptr) {
+ n_seq_id.resize(n_tokens);
+ seq_id.resize(n_tokens);
+ seq_id_arr.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ n_seq_id[i] = 1;
+ seq_id[i].resize(1);
+ seq_id[i][0] = batch.all_seq_id;
+ seq_id_arr[i] = seq_id[i].data();
+ }
+
+ batch.n_seq_id = n_seq_id.data();
+ batch.seq_id = seq_id_arr.data();
+ }
+
+ ggml_backend_sched_reset(lctx.sched);
+ ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+
+ ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
+
+ // the output embeddings after the final encoder normalization
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 1];
+
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
+
+ llama_set_inputs(lctx, batch);
+
+ llama_graph_compute(lctx, gf, n_threads);
+
+ // extract embeddings
+ if (embd) {
+ ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+ GGML_ASSERT(backend_embd != nullptr);
+
+ // extract token embeddings
+ GGML_ASSERT(lctx.embd != nullptr);
+
+ lctx.embd_enc.resize(n_tokens*n_embd);
+ float * embd_out = lctx.embd_enc.data();
+
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+
+ // remember the sequence ids used during the encoding - needed for cross attention later
+ lctx.seq_ids_enc.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ for (int s = 0; s < batch.n_seq_id[i]; s++) {
+ llama_seq_id seq_id = batch.seq_id[i][s];
+ lctx.seq_ids_enc[i].insert(seq_id);
+ }
+ }
+ }
+
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+ // overlap with device computation.
+ ggml_backend_sched_reset(lctx.sched);
+
+ return 0;
+}
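+
+// typical encoder-decoder (T5) flow, sketched with illustrative variables and
+// no error handling:
+//
+//   if (llama_model_has_encoder(model)) {
+//       llama_encode(ctx, batch);                        // run the encoder on the prompt
+//       llama_token dec = llama_model_decoder_start_token(model);
+//       if (dec == -1) { dec = llama_token_bos(model); } // fall back to BOS if unset
+//       // then feed `dec` to llama_decode() and sample as usual
+//   }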
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}
+static bool llama_is_unused_token(const llama_vocab& vocab, llama_token id) {
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
+}
+
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
- case LLAMA_VOCAB_TYPE_SPM: {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
}
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
static const char * hex = "0123456789ABCDEF";
switch (llama_vocab_get_type(vocab)) {
- case LLAMA_VOCAB_TYPE_SPM: {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
auto token = vocab.token_to_id.find(buf);
if (token != vocab.token_to_id.end()) {
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
+ case LLAMA_VOCAB_PRE_TYPE_JAIS:
regex_exprs = {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
- case LLAMA_VOCAB_PRE_TYPE_PORO:
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
+ regex_exprs = {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
+ regex_exprs = {
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = {
+ "\\p{N}",
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
const llama_vocab & vocab;
};
+struct naive_trie {
+ naive_trie() : has_value(false), value(0) {
+ }
+ void insert(const char * key, size_t len, int32_t value = 0) {
+ if (len == 0) {
+ this->has_value = true;
+ this->value = value;
+ return;
+ }
+ char c = key[0];
+ auto res = children.find(c);
+ if (res != children.end()) {
+ res->second.insert(key + 1, len - 1, value);
+ } else {
+ auto res = children.insert(std::make_pair(c, naive_trie()));
+ res.first->second.insert(key + 1, len - 1, value);
+ }
+ }
+ std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
+ if (len == 0 || offset == len) {
+ return std::make_pair(key, offset);
+ }
+ char c = key[offset];
+ auto res = children.find(c);
+ if (res != children.end()) {
+ return res->second.get_longest_prefix(key, len, offset + 1);
+ } else {
+ return std::make_pair(key, offset);
+ }
+ }
+ struct naive_trie * traverse(const char c) {
+ auto res = children.find(c);
+ if (res != children.end()) {
+ return &res->second;
+ } else {
+ return NULL;
+ }
+ }
+ std::map<char, struct naive_trie> children;
+ bool has_value;
+ llama_token value;
+};
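+// illustrative: given `naive_trie t; t.insert("foo", 3, 42);`,
+// t.get_longest_prefix("foobar", 6) yields ("foobar", 3), i.e. 3 bytes matched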
+
+struct llm_tokenizer_ugm {
+ llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
+ if (vocab.precompiled_charsmap.size() > 0) {
+ size_t charsmap_offset = 0;
+
+ // The first four bytes of precompiled_charsmap contain the length of the binary
+ // blob containing the XOR-compressed compact double array (XCDA) entries
+ uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
+ charsmap_offset += sizeof(xcda_blob_size);
+ if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
+ throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+ }
+
+ // Next xcda_blob_size bytes contain entries of XOR-compressed compact
+ // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
+ xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
+ xcda_array_size = xcda_blob_size / sizeof(uint32_t);
+ charsmap_offset += xcda_blob_size;
+
+ // Remaining bytes of precompiled charsmap contain null-terminated
+ // replacement strings for prefixes matched by the XCDA.
+ prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
+ prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
+ }
+
+ for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
+ const auto &token_data = vocab.id_to_token[id];
+
+ if (llama_is_normal_token(vocab, id)) {
+ min_score = std::min<float>(min_score, token_data.score);
+ max_score = std::max<float>(max_score, token_data.score);
+ }
+
+ if (llama_is_normal_token(vocab, id) ||
+ llama_is_user_defined_token(vocab, id) ||
+ llama_is_unused_token(vocab, id)) {
+ token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
+ }
+
+ if (llama_is_user_defined_token(vocab, id)) {
+ user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
+ }
+ }
+
+ unknown_token_score = min_score - unknown_token_score_penalty;
+ }
+
+ /* This implementation is based on the SentencePiece optimized Viterbi algorithm for
+ * unigram language models. The general idea is to:
+ * - move along the input sequence in steps of one UTF code point,
+ * - at each step find all possible tokenizations of the prefix by
+ * traversing the token trie,
+ * - for each tokenization keep only the best one seen so far (highest score),
+ * using the position in the sequence right after the given token as the
+ * index where the result is stored,
+ * - if there is no valid tokenization of the current code point, fall back
+ * to the unknown token with an additional score penalty.
+ * After processing the whole sequence we backtrack from the end to recover
+ * the best tokenization.
+ */
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ // normalize the input first
+ std::string normalized;
+ normalize(text, &normalized);
+ size_t input_len = normalized.size();
+ if (input_len == 0) {
+ return;
+ }
+
+ // initialize score_sum to -FLT_MAX so it will always be lower than sums of token scores
+ std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
+ // at the beginning tokenization score is zero
+ tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
+
+ for (size_t input_offset = 0; input_offset < input_len;) {
+ size_t prefix_offset = input_offset;
+ // calculate how many code units are in the currently processed UTF code point
+ size_t n_utf8_code_units = std::min<size_t>(utf8_len(normalized[input_offset]), input_len - input_offset);
+
+ // traverse the token matcher trie to find a matching token
+ bool single_codepoint_token_found = false;
+ const struct best_tokenization & current_best = tokenization_results[input_offset];
+ struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
+
+ while (prefix_offset <= input_len && node != NULL) {
+ // check if we found a valid token in the prefix
+ if (node->has_value) {
+ // check if it corresponds to the whole UTF code point
+ if (prefix_offset - input_offset == n_utf8_code_units) {
+ single_codepoint_token_found = true;
+ }
+ llama_token token_id = node->value;
+ const auto & token_data = vocab.id_to_token[token_id];
+
+ // we set the user-defined token scores to 0 to make them more likely to be selected
+ // (normal token scores are log probabilities, so they are negative)
+ // score type is double here to make tokenization results exactly
+ // the same as in the HF tokenizer using SentencePiece
+ const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
+ const double challenger_score = current_best.score_sum + token_score;
+ struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+ if (challenger_score > current_champ.score_sum) {
+ struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+ current_champ = challenger;
+ }
+ }
+ node = node->traverse(normalized[prefix_offset++]);
+ }
+
+ // if we didn't find a valid token corresponding to the whole UTF code point
+ // then use unknown token as the tokenization of this UTF code point
+ if (!single_codepoint_token_found) {
+ const double challenger_score = current_best.score_sum + unknown_token_score;
+ prefix_offset = input_offset + n_utf8_code_units;
+ struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+ if (challenger_score > current_champ.score_sum) {
+ struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
+ current_champ = challenger;
+ }
+ }
+
+ // move to the next UTF code point
+ input_offset += n_utf8_code_units;
+ }
+
+ // now backtrack from the end to gather token ids of the best tokenization
+ // merge sequences of consecutive unknown tokens into a single unknown token
+ bool is_prev_unknown = false;
+ for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
+ bool is_unknown = tokenization.token_id == vocab.special_unk_id;
+ if (!(is_prev_unknown && is_unknown)) {
+ output.push_back(tokenization.token_id);
+ }
+ if (tokenization.input_offset == 0) {
+ break;
+ }
+ is_prev_unknown = is_unknown;
+ }
+
+ // reverse the output since we added tokens starting from the end of the input
+ std::reverse(output.begin(), output.end());
+ }
+
+private:
+ const llama_vocab & vocab;
+
+ // helper structure for returning normalization results
+ struct normalization_result {
+ const char * normalized;
+ size_t normalized_len;
+ size_t consumed_input;
+ };
+
+ void normalize(const std::string& input, std::string * normalized) {
+ normalized->clear();
+ normalized->reserve(input.size() * 3);
+
+ const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
+
+ bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
+ bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
+ bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
+
+ bool is_space_prepended = false;
+ bool processing_non_ws = false;
+
+ size_t input_len = input.size();
+
+ for (size_t input_offset = 0; input_offset < input_len; ) {
+ auto norm_res = normalize_prefix(input, input_offset);
+ for (size_t i = 0; i < norm_res.normalized_len; i++) {
+ char c = norm_res.normalized[i];
+ if (c != ' ') {
+ if (!processing_non_ws) {
+ processing_non_ws = true;
+ if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
+ normalized->append(space);
+ is_space_prepended = true;
+ }
+ }
+ normalized->push_back(c);
+ } else {
+ if (processing_non_ws) {
+ processing_non_ws = false;
+ }
+ if (!shall_merge_spaces) {
+ normalized->append(space);
+ }
+ }
+ }
+
+ input_offset += norm_res.consumed_input;
+ }
+
+ if (shall_append_space) {
+ normalized->append(space);
+ }
+ }
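+ // e.g. with space prefix, whitespace escaping and extra-whitespace removal
+ // enabled, "Hello  world" normalizes to "▁Hello▁world" (space -> U+2581)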
+
+ /*
+ * This structure is a view wrapper for an XOR-compressed double array (XCDA).
+ * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
+ * Each bit-packed entry contains:
+ * - BASE array value in bits 10-30
+ * - LCHECK array value in bits 0-7
+ * - LEAF flag in bit 8
+ * Bit 9 marks BASE values that are stored with an extra 8-bit left shift.
+ * Entries containing indexes of replacement sequences have bit 31 set.
+ */
+ struct xcda_array_view {
+ public:
+ xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
+ }
+ uint32_t get_base(size_t index) {
+ uint32_t packed_node = get_node(index);
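+ // BASE lives in bits 10-30; when flag bit 9 is set, the stored value is
+ // shifted left by an extra 8 bits ((packed_node & (1U << 9)) >> 6 == 8)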
+ return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
+ }
+ uint32_t get_lcheck(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return packed_node & ((1U << 31) | 0xff);
+ }
+ bool get_leaf(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return (packed_node >> 8) & 1;
+ }
+ uint32_t get_value(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return packed_node & ((1U << 31) - 1);
+ }
+ private:
+ uint32_t get_node(size_t index) {
+ if (index >= xcda_array_size) {
+ throw std::runtime_error("Index out of array bounds in XCDA array!");
+ }
+ return xcda_array[index];
+ }
+ const uint32_t * xcda_array;
+ size_t xcda_array_size;
+ };
+
+ struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
+ if (input_offset == input.size()) {
+ return { &input[input_offset], 0, 0 };
+ }
+
+ // if input prefix matches some user-defined token return this token as normalization result
+ auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+ if (user_defined_token_match.second > 0) {
+ return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
+ }
+
+ size_t longest_prefix_length = 0;
+ size_t longest_prefix_offset = 0;
+
+ if (xcda_array_size > 0) {
+ struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
+
+ // Find the longest normalized sequence matching the input prefix by walking
+ // the XOR-compressed compact double array (XCDA) starting from the root node
+ // We find the index of the next node by calculating BASE[s] ^ c where s is
+ // the index of the previous node and c is a numerical character value
+ uint32_t node_index = 0;
+ // get BASE of the root node
+ node_index = xcda_view.get_base(node_index);
+ for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
+ unsigned char c = input[prefix_offset];
+ if (c == 0) {
+ break;
+ }
+ node_index ^= c;
+ // if value of LCHECK is not c it means that this is not a child of
+ // the previous node, so we stop matching
+ if (xcda_view.get_lcheck(node_index) != c) {
+ break;
+ }
+ bool is_leaf = xcda_view.get_leaf(node_index);
+ // get BASE of the current node
+ node_index ^= xcda_view.get_base(node_index);
+ // if LEAF of the current node is true, it means that its BASE points to the node
+ // containing index of replacement sequence for currently matched input prefix
+ if (is_leaf) {
+ longest_prefix_length = prefix_offset - input_offset + 1;
+ // get index of replacement sequence for currently matched input prefix
+ longest_prefix_offset = xcda_view.get_value(node_index);
+ }
+ }
+ }
+
+ if (longest_prefix_length > 0) {
+ // we have a match, so return the replacement sequence
+ if (longest_prefix_offset >= prefix_replacements_size) {
+ throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+ }
+ const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
+ return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+ } else {
+ // check if the input prefix contains a valid sequence of UTF-8 code units
+ try {
+ // if yes, return this sequence unmodified
+ size_t prefix_offset = input_offset;
+ unicode_cpt_from_utf8(input, prefix_offset);
+ return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+ } catch (std::invalid_argument & /*ex*/) {
+ // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+ return { "\xEF\xBF\xBD", 3, 1 };
+ }
+ }
+ }
+
+ // escaped space symbol - U+2581 (Lower One Eighth Block)
+ const std::string escaped_space = "\xE2\x96\x81";
+
+ const char * prefix_replacements = NULL;
+ size_t prefix_replacements_size = 0;
+
+ const uint32_t * xcda_array = NULL;
+ size_t xcda_array_size = 0;
+
+ struct naive_trie user_defined_token_matcher;
+
+ // this structure stores the best tokenization so far at input_offset
+ struct best_tokenization {
+ llama_token token_id;
+ size_t input_offset;
+ float score_sum;
+ };
+
+ float min_score = FLT_MAX;
+ float max_score = -FLT_MAX;
+
+ float unknown_token_score_penalty = 10.0f;
+ float unknown_token_score;
+
+ struct naive_trie token_matcher;
+};
+
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
- bool is_prev_special = false;
+ bool is_prev_special = true; // prefix with space if first token
if (add_special && vocab.tokenizer_add_bos) {
GGML_ASSERT(vocab.special_bos_id != -1);
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
- if (vocab.tokenizer_add_space_prefix) {
- if (!output.size() || is_prev_special) { // prefix with space if first token
- raw_text = " " + raw_text;
- }
+ // prefix with space if previous is special
+ if (vocab.tokenizer_add_space_prefix && is_prev_special) {
+ raw_text = " " + raw_text;
}
#ifdef PRETOKENIZERDEBUG
llm_tokenizer_spm tokenizer(vocab);
llama_escape_whitespace(raw_text);
tokenizer.tokenize(raw_text, output);
+ is_prev_special = false;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
is_prev_special = true;
if (add_special) {
tokenizer.append_bos(output);
}
-
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
output.push_back(vocab.special_sep_id);
}
} break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ {
+ llm_tokenizer_ugm tokenizer(vocab);
+
+ if (add_special && vocab.tokenizer_add_bos != 0) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ }
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+ tokenizer.tokenize(raw_text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && vocab.tokenizer_add_eos == 1) {
+ GGML_ASSERT(vocab.special_eos_id != -1);
+ output.push_back(vocab.special_eos_id);
+ }
+ } break;
case LLAMA_VOCAB_TYPE_NONE:
GGML_ASSERT(false);
}
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
- throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+ LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+ return nullptr;
}
}
const llm_arch arch = qs.model.arch;
const auto tn = LLM_TN(arch);
- auto use_more_bits = [](int i_layer, int num_layers) -> bool {
- return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ auto use_more_bits = [](int i_layer, int n_layers) -> bool {
+ return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
};
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
// sanity checks
//
- // - qs.n_attention_wv == 0 for Mamba models
- // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+ // - qs.n_attention_wv == 0 for Mamba models
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+ // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
//
- GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
size_t total_size_org = 0;
size_t total_size_new = 0;
quantize &= name.find("ssm_x.weight") == std::string::npos;
quantize &= name.find("ssm_dt.weight") == std::string::npos;
+ // do not quantize relative position bias (T5)
+ quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
enum ggml_type new_type;
void * new_data;
size_t new_size;
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
+ /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
params.flash_attn = false;
}
+ if (params.flash_attn && model->hparams.attn_soft_cap) {
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__);
+ params.flash_attn = false;
+ }
+
if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
params.flash_attn = false;
}
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
- cparams.causal_attn = hparams.causal_attn;
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
}
}
+ if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+ cparams.causal_attn = hparams.causal_attn;
+ } else {
+ cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+ }
+
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
case LLM_ARCH_BLOOM:
case LLM_ARCH_MAMBA:
case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_T5:
+ case LLM_ARCH_JAIS:
return LLAMA_ROPE_TYPE_NONE;
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_OLMO:
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_CHATGLM:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
+ case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
return it->second;
}
+bool llama_model_has_encoder(const struct llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5: return true;
+ default: return false;
+ }
+}
+
+llama_token llama_model_decoder_start_token(const struct llama_model * model) {
+ return model->hparams.dec_start_token_id;
+}
+
uint32_t llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const auto & hparams = ctx->model.hparams;
const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
// NOTE: kv_size and kv_buf_size are mostly used for sanity checks
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
std::vector<uint8_t> tmp_buf;
for (int il = 0; il < (int) n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
tmp_buf.resize(k_size);
const auto & hparams = ctx->model.hparams;
const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
size_t kv_buf_size;
uint32_t kv_head;
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
for (int il = 0; il < (int) n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
const auto & hparams = ctx->model.hparams;
const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
for (uint32_t i = 0; i < kv_self.size; ++i) {
const auto & cell = kv_self.cells[i];
}
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
// types of keys and values
s_cell_data_size += sizeof(int32_t) * 2;
// k_size_row and v_size_el values of layer
const auto & hparams = ctx->model.hparams;
const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
// Write the layer count
data_ctx.write(&n_layer, sizeof(n_layer));
- // Write n_embd_v_gqa
- data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+ // Write n_embd_v_gqa (reference value)
+ {
+ const uint32_t n_embd_v_gqa_ref = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+ data_ctx.write(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+ }
// Iterate the ranges and write all the pos (this is the token position in the prompt)
for (const auto & range : cell_ranges) {
// Get whole range at a time
std::vector<uint8_t> tmp_buf;
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
// Write key type
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
data_ctx.write(&k_type_i, sizeof(k_type_i));
// TODO: simplify, reduce copy-paste
if (!kv_self.v_trans) {
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
// Write value type
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
data_ctx.write(&v_type_i, sizeof(v_type_i));
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
const uint32_t kv_size = kv_self.size;
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
// Write value type
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
data_ctx.write(&v_type_i, sizeof(v_type_i));
// Sanity check model compatibility
const auto & hparams = ctx->model.hparams;
const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
if (n_layer != n_layer_ref) {
LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
return 0;
}
- if (n_embd_v_gqa != n_embd_v_gqa_ref) {
- LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
+
+ if (hparams.n_embd_v_gqa() + hparams.n_embd_v_s() != n_embd_v_gqa_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, hparams.n_embd_v_gqa() + hparams.n_embd_v_s(), n_embd_v_gqa_ref);
return 0;
}
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
// Read type of key
int32_t k_type_i_ref;
memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
// TODO: simplify, reduce copy-paste
if (!kv_self.v_trans) {
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
// Read type of value
int32_t v_type_i_ref;
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
} else {
// For each layer, read the values for each cell (transposed)
for (int il = 0; il < (int)n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
// Read type of value
int32_t v_type_i_ref;
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
if (batch.logits) free(batch.logits);
}
+int32_t llama_encode(
+ struct llama_context * ctx,
+ struct llama_batch batch) {
+ const int ret = llama_encode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
+ }
+
+ return ret;
+}
+
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
return model->vocab.special_eot_id;
}
+llama_token llama_token_pad(const struct llama_model * model) {
+ return model->vocab.special_pad_id;
+}
+
int32_t llama_tokenize(
const struct llama_model * model,
const char * text,
bool add_special,
bool parse_special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
-
if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size());
const auto utf8 = unicode_cpt_to_utf8(cpt);
try {
decoded_text += unicode_utf8_to_byte(utf8);
- } catch (const std::out_of_range & e) {
+ } catch (const std::out_of_range & /*e*/) {
decoded_text += "[UNK_BYTE_0x";
for (const auto c : utf8) {
decoded_text += format("%02x", (uint8_t) c);
}
// does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
- if (!special && llama_is_control_token(model->vocab, token)) {
+ static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+ const llama_token_attr attr = llama_token_get_attr(model, token);
+ if (!special && (attr & attr_special)) {
return 0;
}
+ // copy piece chars to output text buffer
+ // skip up to 'lstrip' leading spaces before copying
+ auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+ for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+ token++;
+ size--;
+ }
+ if (length < (int32_t)size) {
+ return (int32_t) -size;
+ }
+ memcpy(buf, token, size);
+ return (int32_t) size;
+ };
+
// if we have a cache - use it
{
const auto & cache = model->vocab.cache_token_to_piece;
if (!cache.empty()) {
- const auto & res = cache.at(token);
- if (length < (int) res.size()) {
- return -(int) res.size();
- }
- memcpy(buf, res.c_str(), res.size());
- return res.size();
+ const auto & result = cache.at(token);
+ return _try_copy(result.data(), result.size());
}
}
if (0 <= token && token < llama_n_vocab(model)) {
+ const std::string & token_text = model->vocab.id_to_token[token].text;
switch (llama_vocab_get_type(model->vocab)) {
case LLAMA_VOCAB_TYPE_WPM:
- case LLAMA_VOCAB_TYPE_SPM: {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
- if (llama_is_normal_token(model->vocab, token)) {
- std::string result = model->vocab.id_to_token[token].text;
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+ return _try_copy(token_text.data(), token_text.size());
+ } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+ std::string result = token_text;
llama_unescape_whitespace(result);
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (
- (llama_is_user_defined_token(model->vocab, token)) ||
- (llama_is_control_token (model->vocab, token) && special)) {
- std::string result = model->vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
- if (length < 3) {
- return -3;
- }
- memcpy(buf, "\xe2\x96\x85", 3);
- return 3;
- } else if (llama_is_byte_token(model->vocab, token)) {
- if (length < 1) {
- return -1;
- }
- buf[0] = llama_token_to_byte(model->vocab, token);
- return 1;
+ return _try_copy(result.data(), result.size());
+ } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+ char byte = (char) llama_token_to_byte(model->vocab, token);
+ return _try_copy((char*) &byte, 1);
}
break;
}
case LLAMA_VOCAB_TYPE_BPE: {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
- if (llama_is_normal_token(model->vocab, token)) {
- std::string result = model->vocab.id_to_token[token].text;
- result = llama_decode_text(result);
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (
- (llama_is_user_defined_token(model->vocab, token)) ||
- (llama_is_control_token (model->vocab, token) && special)) {
- std::string result = model->vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+ return _try_copy(token_text.data(), token_text.size());
+ } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+ std::string result = llama_decode_text(token_text);
+ return _try_copy(result.data(), result.size());
}
break;
}
return 0;
}
+int32_t llama_detokenize(
+ const struct llama_model * model,
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) {
+ int32_t avail = text_len_max;
+ int32_t total = 0;
+
+ // remove the leading space
+ bool remove_space = model->vocab.tokenizer_add_space_prefix;
+
+ if (remove_special && model->vocab.tokenizer_add_bos) {
+ if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
+ remove_space = false;
+ n_tokens--;
+ tokens++;
+ }
+ }
+
+ if (remove_special && model->vocab.tokenizer_add_eos) {
+ if (n_tokens > 0 && tokens[n_tokens-1] == model->vocab.special_eos_id) {
+ n_tokens--;
+ }
+ }
+
+ for (int32_t i = 0; i < n_tokens; ++i) {
+ GGML_ASSERT(avail >= 0);
+ int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
+ remove_space = false;
+ if (n_chars < 0) {
+ avail = 0;
+ total -= n_chars;
+ } else if (n_chars > 0) {
+ avail -= n_chars;
+ text += n_chars;
+ total += n_chars;
+ }
+ }
+
+ if (total > text_len_max) {
+ return -total;
+ }
+
+ if (model->vocab.tokenizer_clean_spaces) {
+ text -= total; // restart text
+
+ // first pass: characters ?!., //TODO: where do these characters come from?
+ const int32_t total1 = total;
+ total = total ? 1 : 0;
+ for (int32_t i = 1; i < total1; ++i) {
+ const char x = text[i];
+ if (text[i - 1] == ' ') {
+ if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
+ total--; // remove space
+ }
+ }
+ text[total++] = x;
+ }
+
+ // second pass: strip single apostrophe between spaces
+ const int32_t total2 = total;
+ total = total ? 1 : 0;
+ for (int32_t i = 1; i < total2; ++i) {
+ const char x = text[i];
+ if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
+ total--; // remove prev space
+ text[++i] = '\0'; // remove next space
+ }
+ text[total++] = x;
+ }
+
+ // third pass: apostrophe contractions //NOTE: does this make sense?
+ const int32_t total3 = total;
+ total = total ? 1 : 0;
+ for (int32_t i = 1; i < total3; ++i) {
+ const char x = text[i];
+ if (text[i - 1] == ' ') {
+ if (x == '\'' && i + 1 < total3) {
+ const char x1 = text[i + 1];
+ if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
+ //total--; // remove space
+ } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
+ total--; // remove space
+ } else if (i + 2 < total3) {
+ const char x2 = text[i + 2];
+ if ((x1 == 'l' && x2 == 'l')) { // " 'll"
+ //total--; // remove space
+ } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
+ total--; // remove space
+ } else {
+ //total--; // remove space
+ }
+ } else {
+ //total--; // remove space
+ }
+ }
+ }
+ text[total++] = x;
+ }
+ }
+
+ return total <= text_len_max ? total : -total;
+}
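+// note: a negative return value is the required (pre-cleanup) buffer size, e.g.:
+//   int32_t n = llama_detokenize(model, toks, n_toks, buf, (int32_t) sizeof(buf), false, false);
+//   if (n < 0) { /* buf too small, -n bytes are sufficient */ }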
+
// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
size_t start = 0;
std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss;
- if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
+ auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
+ return tmpl.find(haystack) != std::string::npos;
+ };
+ if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
// chatml template
for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
- } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
+ } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
// llama2 template and its variants
// [variant] support system message
- bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+ bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
// [variant] space before + after response
- bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+ bool space_around_response = tmpl_contains("' ' + eos_token");
// [variant] add BOS inside history
- bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
// [variant] trim spaces from the input message
- bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+ bool strip_message = tmpl_contains("content.strip()");
// construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] ";
}
}
// llama2 templates seem to not care about "add_generation_prompt"
- } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+ } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
// Phi 3
for (auto message : chat) {
std::string role(message->role);
if (add_ass) {
ss << "<|assistant|>\n";
}
- } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
+ } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
// zephyr template
for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
if (add_ass) {
ss << "<|assistant|>\n";
}
- } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
+ } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
if (add_ass) {
ss << "<start_of_turn>model\n";
}
- } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+ } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
// OrionStarAI/Orion-14B-Chat
std::string system_prompt = "";
for (auto message : chat) {
ss << message->content << "</s>";
}
}
- } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+ } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
// openchat/openchat-3.5-0106,
for (auto message : chat) {
std::string role(message->role);
if (add_ass) {
ss << "GPT4 Correct Assistant:";
}
- } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
// eachadea/vicuna-13b-1.1 (and Orca variant)
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
// Orca-Vicuna variant uses a system prefix
- if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+ if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
ss << "SYSTEM: " << message->content << "\n";
} else {
ss << message->content << "\n\n";
if (add_ass) {
ss << "ASSISTANT:";
}
- } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+ } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
// deepseek-ai/deepseek-coder-33b-instruct
for (auto message : chat) {
std::string role(message->role);
if (add_ass) {
ss << "### Response:\n";
}
- } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+ } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
// CohereForAI/c4ai-command-r-plus
for (auto message : chat) {
std::string role(message->role);
if (add_ass) {
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
}
- } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+ } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
// Llama 3
for (auto message : chat) {
std::string role(message->role);
if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
}
+ } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
+ // chatglm3-6b
+ ss << "[gMASK]" << "sop";
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n " << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>";
+ }
+ } else if (tmpl == "chaglm4" || tmpl_contains("[gMASK]<sop>")) {
+ ss << "[gMASK]" << "<sop>";
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n" << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>";
+ }
+ } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "user") {
+ ss << u8"<用户>";
+ ss << trim(message->content);
+ ss << "<AI>";
+ } else {
+ ss << trim(message->content);
+ }
+ }
+ } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+ // DeepSeek-V2
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "\n\n";
+ } else if (role == "user") {
+ ss << "User: " << message->content << "\n\n";
+ } else if (role == "assistant") {
+ ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>";
+ }
+ }
+ if (add_ass) {
+ ss << "Assistant:";
+ }
} else {
// template not supported
return -1;