unicode-data.cpp
unicode.cpp
unicode.h
+ models/apertus.cpp
+ models/arcee.cpp
+ models/arctic.cpp
+ models/arwkv7.cpp
+ models/baichuan.cpp
+ models/bailingmoe.cpp
+ models/bailingmoe2.cpp
+ models/bert.cpp
+ models/bitnet.cpp
+ models/bloom.cpp
+ models/chameleon.cpp
+ models/chatglm.cpp
+ models/codeshell.cpp
+ models/cogvlm.cpp
+ models/cohere2-iswa.cpp
+ models/command-r.cpp
+ models/dbrx.cpp
+ models/deci.cpp
+ models/deepseek.cpp
+ models/deepseek2.cpp
+ models/dots1.cpp
+ models/dream.cpp
+ models/ernie4-5-moe.cpp
+ models/ernie4-5.cpp
+ models/exaone.cpp
+ models/exaone4.cpp
+ models/falcon-h1.cpp
+ models/falcon.cpp
+ models/gemma-embedding.cpp
+ models/gemma.cpp
+ models/gemma2-iswa.cpp
+ models/gemma3-iswa.cpp
+ models/gemma3n-iswa.cpp
+ models/glm4-moe.cpp
+ models/glm4.cpp
+ models/gpt2.cpp
+ models/gptneox.cpp
+ models/granite-hybrid.cpp
+ models/granite.cpp
+ models/grok.cpp
+ models/grovemoe.cpp
+ models/hunyuan-dense.cpp
+ models/hunyuan-moe.cpp
+ models/internlm2.cpp
+ models/jais.cpp
+ models/jamba.cpp
+ models/lfm2.cpp
+ models/llada-moe.cpp
+ models/llada.cpp
+ models/llama-iswa.cpp
+ models/llama.cpp
+ models/mamba.cpp
+ models/minicpm3.cpp
+ models/minimax-m2.cpp
+ models/mpt.cpp
+ models/nemotron-h.cpp
+ models/nemotron.cpp
+ models/neo-bert.cpp
+ models/olmo.cpp
+ models/olmo2.cpp
+ models/olmoe.cpp
+ models/openai-moe-iswa.cpp
+ models/openelm.cpp
+ models/orion.cpp
+ models/phi2.cpp
+ models/phi3.cpp
+ models/plamo.cpp
+ models/plamo2.cpp
+ models/plm.cpp
+ models/qwen.cpp
+ models/qwen2.cpp
+ models/qwen2moe.cpp
+ models/qwen2vl.cpp
+ models/qwen3.cpp
+ models/qwen3vl.cpp
+ models/qwen3vl-moe.cpp
+ models/qwen3moe.cpp
+ models/refact.cpp
+ models/rwkv6-base.cpp
+ models/rwkv6.cpp
+ models/rwkv6qwen2.cpp
+ models/rwkv7-base.cpp
+ models/rwkv7.cpp
+ models/seed-oss.cpp
+ models/smallthinker.cpp
+ models/smollm3.cpp
+ models/stablelm.cpp
+ models/starcoder.cpp
+ models/starcoder2.cpp
+ models/t5-dec.cpp
+ models/t5-enc.cpp
+ models/wavtokenizer-dec.cpp
+ models/xverse.cpp
+ models/graph-context-mamba.cpp
)
target_include_directories(llama PRIVATE .)
#include "ggml-cpp.h"
+#include "models/models.h"
+
#include <algorithm>
#include <cassert>
#include <cfloat>
return layers[il].rope_short;
}
-struct llm_build_llama : public llm_graph_context { // forward-graph builder; all work happens in the constructor
- llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final RMS norm -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // 0 means "unset": fall back to 1/sqrt(head size)
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
-
- // norm (RMS) applied to the layer input before attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them (each projection bias is optional and added only when present)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- if (hparams.use_kq_norm) {
- // Llama4TextL2Norm: weightless RMS norm of Q and K after RoPE
- Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
- Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
- cb(Qcur, "Qcur_normed", il);
- cb(Kcur, "Kcur_normed", il);
- }
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network (non-MoE): taken when the layer has no expert router (ffn_gate_inp)
- if (model.layers[il].ffn_gate_inp == nullptr) {
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch: same pre-norm, then the expert FFN with softmax gating
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
- cb(cur, "ffn_out", il); // note: the "ffn_out" label is also used above for the dense FFN result
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_llama_iswa : public llm_graph_context { // forward-graph builder using the iswa attention input (build_attn_inp_kv_iswa); all work happens in the constructor
- llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final RMS norm -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- // temperature tuning: per-token scale applied to Q on layers that skip RoPE
- ggml_tensor * inp_attn_scale = build_inp_attn_scale();
-
- auto * inp_attn = build_attn_inp_kv_iswa();
-
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // 0 means "unset": fall back to 1/sqrt(head size)
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
-
- // every n_no_rope_layer_step-th layer (1-based) skips RoPE; a step of 0 disables RoPE on all layers
- const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
- (il + 1) % hparams.n_no_rope_layer_step != 0;
-
- // norm (RMS) applied to the layer input before attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them (each projection bias is optional and added only when present)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- if (use_rope) {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- } else if (inp_attn_scale) {
- Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale); // no-RoPE layer: scale Q instead
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- if (use_rope && hparams.use_kq_norm) {
- // Llama4TextL2Norm: weightless RMS norm of Q and K, only on layers that applied RoPE
- Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
- Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
- cb(Qcur, "Qcur_normed", il);
- cb(Kcur, "Kcur_normed", il);
- }
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network (non-MoE): taken when the layer has no expert router (ffn_gate_inp)
- if (model.layers[il].ffn_gate_inp == nullptr) {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(ffn_inp_normed, "ffn_norm", il); // label the normalized tensor itself; `cur` still holds the attention output here
-
- ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
- il);
-
- // Shared experts: a dense FFN on the same normalized input, summed with the routed output
- ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(shexp_out, "ffn_moe_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, shexp_out);
- cb(cur, "ffn_moe_out_merged", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_deci : public llm_graph_context { // forward-graph builder with per-layer head/FFN sizes; all work happens in the constructor
- llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks (attention and/or FFN may be absent per layer) -> final RMS norm -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // 0 means "unset": fall back to 1/sqrt(head size)
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- const int64_t n_head_kv = hparams.n_head_kv(il); // per-layer values, queried for this layer index
- const int64_t n_head = hparams.n_head(il);
- const int64_t n_ff = hparams.n_ff(il);
-
- if (n_head == 0) {
- // attention-free layer of Llama-3_1-Nemotron-51B: no norm, input passes straight through
- cur = inpL;
- } else {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- }
-
- if (n_head > 0 && n_head_kv == 0) {
- // "linear attention" of Llama-3_1-Nemotron-51B: only the wo projection is applied
- cur = build_lora_mm(model.layers[il].wo, cur);
- cb(cur, "wo", il);
- } else if (n_head > 0) {
- // self-attention
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them (each projection bias is optional and added only when present)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads using this layer's head counts
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
- // NOTE(review): `continue` skips the `inpL = cur` update below, so anything computed into `cur` above is not carried to the next layer - confirm this is intended when n_head > 0
- if (n_ff == 0) {
- continue;
- }
-
- // modified to support attention-free layer of Llama-3_1-Nemotron-51B: the residual add is done only when attention ran
- ggml_tensor * ffn_inp = cur;
- if (n_head > 0) {
- ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- }
-
- // feed-forward network (built only when the layer has no ffn_gate_inp; there is no MoE branch here)
- if (model.layers[il].ffn_gate_inp == nullptr) {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_baichuan : public llm_graph_context { // forward-graph builder; RoPE is applied only for the 7B model type; all work happens in the constructor
- llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final RMS norm -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions (only built for 7B, the one type that uses RoPE below)
- ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
-
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention (Q/K/V projections have no bias here)
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- switch (model.type) { // 7B: RoPE on Q and K; 13B: Q and K left as-is; any other type aborts
- case LLM_TYPE_7B:
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- break;
- case LLM_TYPE_13B:
- break;
- default:
- GGML_ABORT("fatal error");
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network: RMS pre-norm, then a SiLU FFN with parallel gate and no biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_xverse : public llm_graph_context { // forward-graph builder; all work happens in the constructor
- llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final RMS norm -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
-
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention (Q/K/V projections have no bias here; no rope frequency factors are passed)
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network: RMS pre-norm, then a SiLU FFN with parallel gate and no biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_falcon : public llm_graph_context { // forward-graph builder where attention and FFN both read the normalized layer input; all work happens in the constructor
- llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final norm (with bias) -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * attn_norm;
-
- attn_norm = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(attn_norm, "attn_norm", il);
-
- // self-attention
- {
- if (model.layers[il].attn_norm_2) {
- // Falcon-40B: attention input uses a second norm of inpL; attn_norm is still used for the FFN below
- cur = build_norm(inpL,
- model.layers[il].attn_norm_2,
- model.layers[il].attn_norm_2_b,
- LLM_NORM, il);
- cb(cur, "attn_norm_2", il);
- } else {
- cur = attn_norm;
- }
-
- cur = build_lora_mm(model.layers[il].wqkv, cur); // fused QKV projection
- cb(cur, "wqkv", il);
-
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); // Q view at offset 0; strides are computed with sizeof(float)
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); // K view starts n_embd floats into each row
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); // V view starts n_embd + n_embd_gqa floats into each row
-
- // using mode = 2 for neox mode (NOTE(review): the calls below pass rope_type, not a literal mode - confirm this comment is still accurate)
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids, for every tensor used below
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = cur; // attention output, added back after the FFN
-
- // feed forward
- {
- cur = build_ffn(attn_norm, // !! use the attn norm, not the result
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // FFN output + attention output
- cur = ggml_add(ctx0, cur, inpL); // + layer input (residual)
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- // norm
- cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_grok : public llm_graph_context { // forward-graph builder with MoE FFN, extra post-norms and scaled/soft-capped logits; all work happens in the constructor
- llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final RMS norm -> output projection -> logit scaling
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
-
- // norm (RMS) applied to the layer input before attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
-
- // self-attention
- {
- // compute Q and K and RoPE them (each projection bias is optional and added only when present)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); // attention scale passed as 1.0f here
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- cur = build_norm(cur, // attention output is normalized before the residual add
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_out_norm", il);
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // MoE branch: always built, GELU experts with softmax gating
- ggml_tensor * moe_out = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_GELU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- if (model.layers[il].ffn_up) { // optional dense FFN on the same input, combined with the MoE output
- ggml_tensor * ffn_out = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(ffn_out, "ffn_out", il);
-
- cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2); // (dense + MoE) scaled by sqrt(2)/2
- cb(cur, "ffn_out", il);
- } else {
- cur = moe_out;
- }
-
- cur = build_norm(cur, // FFN output is normalized before the residual add
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_post_norm", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); // logits are always multiplied by f_logit_scale
-
- // final logit soft-capping: cap * tanh(logits / cap), applied only when the cap is non-zero
- if (hparams.f_final_logit_softcapping) {
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
- cur = ggml_tanh(ctx0, cur);
- cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
- }
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_dbrx : public llm_graph_context { // forward-graph builder with fused, clamped QKV and a MoE FFN; all work happens in the constructor
- llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds: embeddings -> n_layer blocks -> final norm -> output projection
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and must equal n_rot
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
-
- // norm (LLM_NORM, weight only) applied to the layer input before attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
-
- cur = build_lora_mm(model.layers[il].wqkv, cur); // fused QKV projection
- cb(cur, "wqkv", il);
-
- cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); // clamp to [-f_clamp_kqv, f_clamp_kqv]
- cb(cur, "wqkv_clamped", il);
-
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); // Q view at offset 0; strides are computed with sizeof(float)
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); // K view starts n_embd floats into each row
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); // V view starts n_embd + n_embd_gqa floats into each row
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- // MoE branch: attn_out_norm serves as the pre-FFN norm here
- cur = build_norm(ffn_inp,
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_out_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_starcoder : public llm_graph_context { // Graph builder: token + position embeddings, fused-QKV attention, GELU FFN, logits head.
- llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
- // look up one row of model.pos_embd per position and add it to the token embeddings
- ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
- cb(pos, "pos_embd", -1);
-
- inpL = ggml_add(ctx0, inpL, pos);
- cb(inpL, "inpL", -1);
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- // Q, K, V are strided views into the fused projection at byte offsets of 0, n_embd and (n_embd + n_embd_gqa) floats
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the residual input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
- // final norm over the last layer's output; the normed tensor is exposed as res->t_embd
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // project with model.output and expose the result as res->t_logits
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_refact : public llm_graph_context { // Graph builder: RMS-norm layers, separate wq/wk/wv projections, no position input, SiLU FFN, logits head.
- llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
- // no position input is built in this graph; Q and K go to build_attn without RoPE
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- // split the flat projections into heads: (n_embd_head, n_head or n_head_kv, n_tokens)
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the residual input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual: attention output + un-normed layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // second residual: FFN output + FFN input
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final RMS norm; the normed tensor is exposed as res->t_embd
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_bert : public llm_graph_context { // Graph builder that branches on model.arch (BERT, NOMIC_BERT, NOMIC_BERT_MOE, JINA_BERT_V2, JINA_BERT_V3); sets res->t_embd only.
- llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
- ggml_tensor * inp_pos = nullptr;
- // the position input is built for every arch except JINA_BERT_V2; inp_pos stays null there
- if (model.arch != LLM_ARCH_JINA_BERT_V2) {
- inp_pos = build_inp_pos();
- }
-
- // construct input embeddings (token, type, position)
- inpL = build_inp_embd(model.tok_embd);
-
- // token types are hardcoded to zero ("Sentence A")
- if (model.type_embd) {
- ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
- inpL = ggml_add(ctx0, inpL, type_row0);
- }
- if (model.arch == LLM_ARCH_BERT) {
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
- }
- cb(inpL, "inp_embd", -1);
-
- // embed layer norm
- inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
- cb(inpL, "inp_norm", -1);
- // attention input is built without a KV cache
- auto * inp_attn = build_attn_inp_no_cache();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * cur = inpL; // shadows the constructor-level cur inside the loop body
-
- {
- ggml_tensor * Qcur;
- ggml_tensor * Kcur;
- ggml_tensor * Vcur;
-
- // self-attention
- if (model.layers[il].wqkv) {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- if (model.layers[il].bqkv) {
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- // Q, K, V are strided views into the fused projection at byte offsets of 0, n_embd and (n_embd + n_embd_gqa) floats
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- } else {
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- }
- // optional Q norm: flatten heads to (n_embd_head*n_head, n_tokens), normalize, then restore the head layout
- if (model.layers[il].attn_q_norm) {
- Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
-
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- }
- // optional K norm, same flatten / normalize / restore sequence with n_head_kv
- if (model.layers[il].attn_k_norm) {
- Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
-
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
-
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- }
-
- // RoPE
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cb(cur, "kqv_out", il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the layer input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // re-add the layer input
- cur = ggml_add(ctx0, cur, inpL);
-
- // attention layer norm
- cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
-
- if (model.layers[il].attn_norm_2 != nullptr) {
- cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
- cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
- }
-
- ggml_tensor * ffn_inp = cur;
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
- // MoE branch
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- nullptr,
- model.layers[il].ffn_down_exps,
- nullptr,
- hparams.n_expert,
- hparams.n_expert_used,
- LLM_FFN_GELU,
- false, false,
- 0.0f,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
- cb(cur, "ffn_moe_out", il);
- } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il); // GELU when a separate ffn_gate tensor exists, GEGLU otherwise
- cb(cur, "ffn_out", il);
- } else {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- // attentions bypass the intermediate layer
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- // output layer norm
- cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // no output projection here: the last layer's output is exposed as res->t_embd
- cb(cur, "result_embd", -1);
- res->t_embd = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_neo_bert : public llm_graph_context { // Graph builder: RMS pre-norm layers, fused-QKV attention with RoPE and no KV cache, SwiGLU FFN; sets res->t_embd only.
- llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
- ggml_tensor * inp_pos = build_inp_pos();
-
- // construct input embeddings (token, type, position)
- inpL = build_inp_embd(model.tok_embd);
- cb(inpL, "inp_embd", -1);
- // attention input is built without a KV cache
- auto * inp_attn = build_attn_inp_no_cache();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * cur = inpL; // shadows the constructor-level cur; the initial value is overwritten by build_norm just below
-
- // pre-norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
-
- {
- ggml_tensor * Qcur;
- ggml_tensor * Kcur;
- ggml_tensor * Vcur;
-
- // self-attention
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- // Q, K, V are strided views into the fused projection at byte offsets of 0, n_embd and (n_embd + n_embd_gqa) floats
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- // RoPE
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cb(cur, "kqv_out", il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the layer input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // re-add the layer input
- cur = ggml_add(ctx0, cur, inpL);
-
- ggml_tensor * ffn_inp = cur;
- cb(ffn_inp, "ffn_inp", il);
-
- // pre-norm
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // feed-forward network
- cur = build_ffn(cur,
- model.layers[il].ffn_up,
- NULL, NULL, NULL, NULL, NULL,
- model.layers[il].ffn_down,
- NULL, NULL, NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
-
- // attentions bypass the intermediate layer
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final RMS norm uses model.output_norm_enc; no output projection follows
- cur = build_norm(cur,
- model.output_norm_enc, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_embd", -1);
- res->t_embd = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_bloom : public llm_graph_context { // Graph builder: normed token embeddings, fused-QKV attention with bias, GELU FFN, logits head; no position input is built.
- llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- auto * inp_attn = build_attn_inp_kv();
- // normalize the token embeddings (tok_norm / tok_norm_b) before the first layer
- inpL = build_norm(inpL,
- model.tok_norm,
- model.tok_norm_b,
- LLM_NORM, -1);
- cb(inpL, "inp_norm", -1);
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- // Q, K, V are strided views into the fused projection at byte offsets of 0, n_embd and (n_embd + n_embd_gqa) floats
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the residual input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // Add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
- // final norm over the last layer's output; the normed tensor is exposed as res->t_embd
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // project with model.output and expose the result as res->t_logits
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_mpt : public llm_graph_context { // Graph builder: optional position embeddings, fused-QKV attention with optional bias/clamp/QK-norm, GELU FFN, logits head.
- llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * pos;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- auto * inp_attn = build_attn_inp_kv();
- // position embeddings are added only when the model provides pos_embd; pos is assigned and used only inside this branch
- if (model.pos_embd) {
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
- cb(pos, "pos_embd", -1);
-
- inpL = ggml_add(ctx0, inpL, pos);
- cb(inpL, "inpL", -1);
- }
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * attn_norm;
-
- attn_norm = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(attn_norm, "attn_norm", il);
-
- // self-attention
- {
- cur = attn_norm;
-
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- if (model.layers[il].bqkv){
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- // clamp the fused projection to [-f_clamp_kqv, f_clamp_kqv] only when the hparam is positive
- if (hparams.f_clamp_kqv > 0.0f) {
- cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(cur, "wqkv_clamped", il);
- }
- // Q, K, V are strided views into the fused projection at byte offsets of 0, n_embd and (n_embd + n_embd_gqa) floats
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- // NOTE(review): the guard below tests attn_q_norm only, yet attn_k_norm is used too — presumably both are present together; confirm
- // Q/K Layernorm
- if (model.layers[il].attn_q_norm) {
- Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
- Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
-
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the residual input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // Add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed forward
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- model.layers[il].ffn_act, // per-layer tensor forwarded to build_ffn; its exact role is defined there — TODO confirm
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm over the last layer's output; the normed tensor is exposed as res->t_embd
- cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // project with model.output and expose the result as res->t_logits
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_stablelm : public llm_graph_context { // Graph builder: separate Q/K/V projections with optional biases and per-head norms, RoPE, SiLU FFN, logits head.
- llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // keep the normed input: it feeds the FFN directly when the layer has no ffn_norm (see below)
- ggml_tensor * inpSA = cur;
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- // split the flat projections into heads: (n_embd_head, n_head or n_head_kv, n_tokens)
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // optional Q/K norms are applied on the 3D head layout, with no bias tensor
- if (model.layers[il].attn_q_norm) {
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- NULL,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- }
-
- if (model.layers[il].attn_k_norm) {
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- NULL,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur, inpL and inpSA so later adds line up
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // residual uses the un-normed layer input inpL, not inpSA
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- if (model.layers[il].ffn_norm) {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- } else {
- // parallel residual
- cur = inpSA;
- }
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm over the last layer's output; the normed tensor is exposed as res->t_embd
- cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen : public llm_graph_context { // Graph builder: RMS-norm layers, fused-QKV attention with bias and RoPE, SiLU FFN, logits head.
- llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // a single head size is used for Q, K and V below, so the K and V head sizes must agree
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- // Q, K, V views at byte offsets of 0, n_embd and 2*n_embd floats; the V offset presumes K spans n_embd floats — TODO confirm
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
-
- // the RoPE mode is passed via rope_type (original note: "using mode = 2 for neox mode")
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the residual input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual: attention output + un-normed layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // second residual: FFN output + FFN input
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final RMS norm; the normed tensor is exposed as res->t_embd
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen2 : public llm_graph_context { // Graph builder: RMS-norm layers, biased Q/K/V projections with RoPE, SiLU FFN, logits head with optional output bias.
- llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // the head size must match both the K head size and the rotary dimension count
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // bq/bk/bv are added without a null check in this builder
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- // split the flat projections into heads: (n_embd_head, n_head or n_head_kv, n_tokens)
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: when inp_out_ids is set, keep just those rows of cur and of the residual input
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual: attention output + un-normed layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- // second residual: FFN output + FFN input
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final RMS norm; the normed tensor is exposed as res->t_embd
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
- // the output bias is optional and added only when the model provides one
- if (model.output_b != nullptr) {
- cur = ggml_add(ctx0, cur, model.output_b);
- }
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_dream : public llm_graph_context { // builds the forward graph for the "dream" model; all work happens in the constructor
- llm_build_dream(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context(params) {
- //copied from qwen2
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_no_cache(); // attention input built via the no-cache helper
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // bias is added unconditionally (no null check on bq)
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // bias is added unconditionally (no null check on bk)
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); // bias is added unconditionally (no null check on bv)
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- cb(Qcur, "Qcur", il); // callbacks repeated after reshape/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_llada : public llm_graph_context { // builds the forward graph for the "llada" model; all work happens in the constructor
- llm_build_llada(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context(params) {
- // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- // Non-causal attention for diffusion
- auto * inp_attn = build_attn_inp_no_cache();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- cb(Qcur, "Qcur", il); // callbacks repeated after reshape/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL, // no output-projection bias is passed
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen2vl : public llm_graph_context { // builds the forward graph for the "qwen2vl" model; all work happens in the constructor
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- int sections[4]; // local copy of the first four rope_sections entries, passed to ggml_rope_multi below
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // bias is added unconditionally (no null check on bq)
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // bias is added unconditionally (no null check on bk)
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); // bias is added unconditionally (no null check on bv)
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_multi( // sectioned RoPE variant: takes the sections array in addition to the usual RoPE parameters
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_multi(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il); // callbacks repeated after reshape/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen2moe : public llm_graph_context { // builds the forward graph for the "qwen2moe" model (routed experts plus a gated shared expert); all work happens in the constructor
- llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) { // Q bias is optional here: added only when the tensor is present
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) { // K bias is optional here: added only when the tensor is present
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) { // V bias is optional here: added only when the tensor is present
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il); // callbacks repeated after reshape/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false, // NOTE(review): this bool differs from qwen3moe (true); presumably the weight-normalisation flag - verify against build_moe_ffn
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- // FFN shared expert
- {
- ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); // gate input is computed from the same normed activations as the experts
- cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
-
- // sigmoid
- ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); // silu(x)/x; NOTE(review): equals sigmoid(x) assuming ggml_silu is x*sigmoid(x), and is 0/0 where x == 0 - confirm, a direct sigmoid op may be safer
- cb(cur_gate, "ffn_shexp_gate", il);
-
- ggml_tensor * cur_ffn = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur_ffn, "ffn_shexp", il);
-
- ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate); // shared-expert output scaled by its gate
- cb(ffn_shexp_out, "ffn_shexp_out", il);
-
- moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); // routed experts + gated shared expert
- cb(moe_out, "ffn_out", il);
-
- cur = moe_out;
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: MoE output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen3 : public llm_graph_context { // builds the forward graph for the "qwen3" model; all work happens in the constructor
- llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); // no Q/K/V bias is applied in this builder
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped Q, applied before RoPE
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped K, applied before RoPE
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il); // callbacks repeated after norm/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen3moe : public llm_graph_context { // builds the forward graph for the "qwen3moe" model (routed experts, no shared expert); all work happens in the constructor
- llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); // no Q/K/V bias is applied in this builder
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped Q, applied before RoPE
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped K, applied before RoPE
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il); // callbacks repeated after norm/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true, // NOTE(review): this bool differs from qwen2moe (false); presumably the weight-normalisation flag - verify against build_moe_ffn
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
- cur = moe_out;
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: MoE output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen3vl : public llm_graph_context { // builds the forward graph for the "qwen3vl" model, including injection of deepstack features into the first layers; all work happens in the constructor
- llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-
- const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
- const size_t n_deepstack_layers = hparams.n_deepstack_layers;
- const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); // width of one slice: the full width split evenly between the main embd and each deepstack embd
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- int sections[4]; // local copy of the first four rope_sections entries, passed to ggml_rope_multi below
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
-
- std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr); // entries stay nullptr unless ubatch.embd is set
-
- if (ubatch.embd) {
- // Image input: split main embd and deepstack embds
- ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0); // first n_embd values of every row; row stride is unchanged
- for (size_t i = 0; i < n_deepstack_layers; i++) {
- deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); // slice i+1 of every row; byte offset assumes float elements in inpL - TODO confirm
- }
- inpL = inpL_main;
- }
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); // no Q/K/V bias is applied in this builder
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped Q, applied before RoPE
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_multi( // sectioned RoPE variant: takes the sections array in addition to the usual RoPE parameters
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped K, applied before RoPE
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_multi(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il); // callbacks repeated after norm/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- if (ubatch.embd && (size_t)il < n_deepstack_layers) { // only the first n_deepstack_layers layers, and only when the views above were created
- cur = ggml_add(ctx0, cur, deepstack_features[il]); // layer il receives deepstack slice il on top of its output
- cb(cur, "deepstack_out", il);
- }
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_qwen3vlmoe : public llm_graph_context { // builds the forward graph for the "qwen3vlmoe" model (routed-expert FFN plus deepstack feature injection); all work happens in the constructor
- llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
- const size_t n_deepstack_layers = hparams.n_deepstack_layers;
- const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); // width of one slice: the full width split evenly between the main embd and each deepstack embd
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
- GGML_ASSERT(n_embd_head == hparams.n_rot); // rotary dimension must equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- int sections[4]; // local copy of the first four rope_sections entries, passed to ggml_rope_multi below
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
-
- std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr); // entries stay nullptr unless ubatch.embd is set
-
- if (ubatch.embd) {
- // Image input: split main embd and deepstack embds
- ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0); // first n_embd values of every row; row stride is unchanged
- for (size_t i = 0; i < n_deepstack_layers; i++) {
- deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); // slice i+1 of every row; byte offset assumes float elements in inpL - TODO confirm
- }
- inpL = inpL_main;
- }
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); // no Q/K/V bias is applied in this builder
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped Q, applied before RoPE
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_multi( // sectioned RoPE variant: takes the sections array in addition to the usual RoPE parameters
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // RMS-norm on the reshaped K, applied before RoPE
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_multi(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il); // callbacks repeated after norm/RoPE with the same names
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(head size)
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // residual must be filtered the same way to stay shape-compatible
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true, // NOTE(review): same flag value as qwen3moe; presumably the weight-normalisation flag - verify against build_moe_ffn
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
- cur = moe_out;
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: MoE output + FFN input
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- if (ubatch.embd && (size_t)il < n_deepstack_layers) { // only the first n_deepstack_layers layers, and only when the views above were created
- cur = ggml_add(ctx0, cur, deepstack_features[il]); // layer il receives deepstack slice il on top of its output
- cb(cur, "deepstack_out", il);
- }
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_phi2 : public llm_graph_context { // builds the forward graph for the "phi2" model: attention and FFN both read the same normed input and are summed with the residual; all work happens in the constructor
- llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // used below as the K width when locating V inside the fused QKV result
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V head sizes must be equal
-
- ggml_tensor * cur;
- ggml_tensor * attn_norm_output;
- ggml_tensor * ffn_output;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- attn_norm_output = build_norm(inpL, // single norm per layer; its output feeds both attention and the FFN
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(attn_norm_output, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
-
- if (model.layers[il].wqkv) { // fused QKV weight present: one matmul, then Q/K/V are taken as views
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); // Q starts at offset 0 of each row
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); // K starts after n_embd floats
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); // V starts after n_embd + n_embd_gqa floats
- } else { // separate Q/K/V weights, each with its bias added unconditionally
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- }
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- // with phi2, we scale the Q to avoid precision issues
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); // scale is 1.0f because Q was already scaled above
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows listed in inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // all three tensors summed below must be filtered identically
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
- }
-
- // FF
- {
- ffn_output = build_ffn(attn_norm_output, // FFN input is the normed layer input, not the attention output
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL, // no gate tensors are passed
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(ffn_output, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_output); // attention output + FFN output
- cur = ggml_add(ctx0, cur, inpL); // plus the residual (layer input)
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normed hidden state is published as the embedding result
-
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output_no_bias", -1);
-
- cur = ggml_add(ctx0, cur, model.output_b); // output bias is added unconditionally (no null check on output_b)
-
- cb(cur, "result_output", -1);
- res->t_logits = cur; // biased output projection is published as the logits result
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-template<bool iswa>
-struct llm_build_phi3 : public llm_graph_context {
- llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
- inp_attn_type * inp_attn = nullptr;
-
- if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_iswa();
- } else {
- inp_attn = build_attn_inp_kv();
- }
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- auto * residual = inpL;
-
- // self-attention
- {
- // rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- ggml_tensor* attn_norm_output = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM_RMS, il);
- cb(attn_norm_output, "attn_norm", il);
-
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
-
- if (model.layers[il].wqkv) {
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
- cb(cur, "wqkv", il);
-
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
- } else {
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- }
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
- cb(Qcur, "Qcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- }
-
- cur = ggml_add(ctx0, cur, residual);
- residual = cur;
-
- cur = build_norm(cur,
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // feed-forward network
- if (model.layers[il].ffn_gate_inp == nullptr) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- }
-
- cur = ggml_add(ctx0, residual, cur);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
-
- if (model.output_b != nullptr) {
- cb(cur, "result_output_no_bias", -1);
- cur = ggml_add(ctx0, cur, model.output_b);
- }
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_plamo : public llm_graph_context {
- llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- ggml_tensor * sa_inp = cur;
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- ggml_tensor * sa_out = cur;
-
- cur = sa_inp;
-
- // feed-forward network
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, sa_out);
- cur = ggml_add(ctx0, cur, inpL);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_gpt2 : public llm_graph_context {
- llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * pos;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
- cb(pos, "pos_embd", -1);
-
- inpL = ggml_add(ctx0, inpL, pos);
- cb(inpL, "inpL", -1);
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_codeshell : public llm_graph_context {
- llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_orion : public llm_graph_context {
- llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- // if (model.layers[il].bq) {
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- // cb(Qcur, "Qcur", il);
- // }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- // if (model.layers[il].bk) {
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- // cb(Kcur, "Kcur", il);
- // }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- // if (model.layers[il].bv) {
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- // cb(Vcur, "Vcur", il);
- // }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_internlm2 : public llm_graph_context {
- llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_minicpm3 : public llm_graph_context {
- llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- //TODO: if the model varies, these parameters need to be read from the model
- const int64_t n_embd_base = 256;
- const float scale_embd = 12.0f;
- const float scale_depth = 1.4f;
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
-
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // scale the input embeddings
- inpL = ggml_scale(ctx0, inpL, scale_embd);
- cb(inpL, "inp_scaled", -1);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- ggml_tensor * q = NULL;
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
- q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
- cb(q, "q", il);
-
- q = build_norm(q,
- model.layers[il].attn_q_a_norm, NULL,
- LLM_NORM_RMS, il);
- cb(q, "q", il);
-
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
- q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
- cb(q, "q", il);
-
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- 0);
- cb(q_nope, "q_nope", il);
-
- // and {n_head * n_embd_head_qk_rope, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- ggml_row_size(q->type, n_embd_head_qk_nope));
- cb(q_pe, "q_pe", il);
-
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
-
- // split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
- kv_pe_compresseed->nb[1],
- 0);
- cb(kv_compressed, "kv_compressed", il);
-
- // and {n_embd_head_qk_rope, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
- kv_pe_compresseed->nb[1],
- kv_pe_compresseed->nb[1],
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
- cb(k_pe, "k_pe", il);
-
- kv_compressed = build_norm(kv_compressed,
- model.layers[il].attn_kv_a_norm, NULL,
- LLM_NORM_RMS, il);
- cb(kv_compressed, "kv_compressed", il);
-
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
- cb(kv, "kv", il);
-
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
-
- // and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
- cb(v_states, "v_states", il);
-
- v_states = ggml_cont(ctx0, v_states);
- cb(v_states, "v_states", il);
-
- q_pe = ggml_rope_ext(
- ctx0, q_pe, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
-
- // shared RoPE key
- k_pe = ggml_rope_ext(
- ctx0, k_pe, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
-
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
- cb(q_states, "q_states", il);
-
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
- cb(k_states, "k_states", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // scale_res - scale the hidden states for residual connection
- const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
- cur = ggml_scale(ctx0, cur, scale_res);
- cb(cur, "hidden_scaled", il);
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- // scale the hidden states for residual connection
- cur = ggml_scale(ctx0, cur, scale_res);
- cb(cur, "hidden_scaled_ffn", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head scaling
- const float scale_lmhead = float(n_embd_base)/float(n_embd);
- cur = ggml_scale(ctx0, cur, scale_lmhead);
- cb(cur, "lmhead_scaling", -1);
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_gemma : public llm_graph_context {
- llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
- cb(Qcur, "Qcur_scaled", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
- cb(sa_out, "sa_out", il);
-
- cur = build_norm(sa_out,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // feed-forward network
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, sa_out);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_gemma2_iswa : public llm_graph_context {
- llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv_iswa();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- cur = build_norm(cur,
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
-
- ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
- cb(sa_out, "sa_out", il);
-
- cur = build_norm(sa_out,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // feed-forward network
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = build_norm(cur,
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "ffn_post_norm", -1);
-
- cur = ggml_add(ctx0, cur, sa_out);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- // final logit soft-capping
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
- cur = ggml_tanh(ctx0, cur);
- cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// Builds the compute graph for the Gemma 3 architecture, using the attention input
-// returned by build_attn_inp_kv_iswa().
-//
-// Per layer: attn_norm (RMS) -> Q/K/V projections -> RMS norm of Q and K -> RoPE with the
-// per-layer frequency base/scale -> attention -> attn_post_norm -> residual add ->
-// ffn_norm -> build_ffn (LLM_FFN_GELU, LLM_FFN_PAR) -> ffn_post_norm -> residual add ->
-// build_cvec. After the last layer the output_norm result is stored in res->t_embd and
-// the model.output projection in res->t_logits.
-struct llm_build_gemma3_iswa : public llm_graph_context {
-    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_k;
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
-        if (ubatch.token) {
-            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-            cb(inpL, "inp_scaled", -1);
-        }
-
-        // inp_pos - contains the positions
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_iswa();
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            // RoPE frequency parameters are looked up per layer
-            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-            // norm
-            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                // split the projections into heads: [n_embd_head, n_head(_kv), n_tokens]
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-                // Q and K are RMS-normalized before RoPE; V is left as is
-                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-                cb(Qcur, "Qcur_normed", il);
-
-                Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-                cb(Kcur, "Kcur_normed", il);
-
-                Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
-                // the attention scale is folded into Q here, so build_attn is called with a scale of 1.0f
-                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
-
-                cur = build_attn(inp_attn,
-                        model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
-            }
-
-            // on the last layer keep only the rows selected by inp_out_ids
-            if (il == n_layer - 1 && inp_out_ids) {
-                cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-            }
-
-            cur = build_norm(cur,
-                    model.layers[il].attn_post_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_post_norm", il);
-
-            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
-            cb(sa_out, "sa_out", il);
-
-            cur = build_norm(sa_out,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            // feed-forward network
-            {
-                cur = build_ffn(cur,
-                        model.layers[il].ffn_up,   NULL, NULL,
-                        model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            // this is a per-layer tensor: pass the layer index (not -1) so it is tagged
-            // like every other per-layer tensor built in this loop
-            cur = build_norm(cur,
-                    model.layers[il].ffn_post_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_post_norm", il);
-
-            cur = ggml_add(ctx0, cur, sa_out);
-
-            cur = build_cvec(cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = build_norm(cur,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-struct llm_build_gemma3n_iswa : public llm_graph_context { // graph builder for Gemma 3n; the helper methods below mirror functions of the Python reference code
- const llama_model & model;
-
- const int64_t n_embd_head;
- const int64_t n_embd_altup;
- const int64_t n_altup;
- const int i_altup_act; // index (3rd dim) of the altup slice that goes through attention/FFN
- const int n_layer_sparsity = 10; // number of layers using activation sparsity
- const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
-
- llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
- : llm_graph_context(params),
- model(model),
- n_embd_head(model.hparams.n_embd_head_k),
- n_embd_altup(model.hparams.n_embd_altup),
- n_altup(model.hparams.n_altup),
- i_altup_act(model.hparams.i_altup_act) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- // TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_iswa();
-
- // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
- ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
-
- // inpL now has only 1 altup, project it to the rest of the altups
- // these "added" altups will be concatenated along the last dim of inpL
- {
- ggml_tensor * target_magnitude = calc_magnitude(inpL);
- ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
- ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
- ggml_tensor * new_magnitude = calc_magnitude(altup_added);
- altup_added = ggml_div(ctx0, // rescale each projected row to the magnitude of the original input row
- ggml_mul(ctx0, altup_added, target_magnitude),
- new_magnitude);
- inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
- cb(inpL, "inp_stacked", -1);
- }
-
- // inpL now has shape: [n_embd, n_tokens, n_altup]
- // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
-
- for (int il = 0; il < n_layer; ++il) {
- // this block is written to closely resemble Gemma3p5DecoderLayer in the Python code
- const float freq_base_l = model.get_rope_freq_base (cparams, il);
- const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
- ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup] (note: shadows the outer `cur` for the duration of the loop body)
- ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
-
- // predicted value will go through self-attention and laurel
- ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
- cur = active_prediction;
- cb(cur, "active_prediction", il);
-
- // norm
- cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // laurel
- ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
-
- // self-attention
- if (hparams.has_kv(il)) {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps); // V is RMS-normalized without a weight tensor
-
- cb(Qcur, "Qcur_normed", il);
- cb(Kcur, "Kcur_normed", il);
- cb(Vcur, "Vcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- cb(Qcur, "Qcur_pos", il);
- cb(Kcur, "Kcur_pos", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
- } else {
- // reuse KV cache of earlier layers
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
- ext_factor, attn_factor, beta_fast, beta_slow);
- cb(Qcur, "Qcur_pos", il);
-
- cur = build_attn(inp_attn, // no new K/V are passed for this layer, only Q
- model.layers[il].wo, NULL,
- Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
- }
-
- cur = build_norm(cur,
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
-
- cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
- cb(cur, "attn_gated", il);
-
- ggml_tensor * attn_laurel = ggml_scale(ctx0,
- ggml_add(ctx0, cur, laurel_out),
- 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
- cb(attn_laurel, "attn_laurel", il);
-
- cur = build_norm(attn_laurel,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // feed-forward network
- {
- ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
- ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
-
- if (il < n_layer_sparsity) {
- // apply activation sparsity
- gate_proj = gaussian_topk(gate_proj);
- }
- gate_proj = ggml_gelu(ctx0, gate_proj);
-
- cur = ggml_mul(ctx0, up_proj, gate_proj);
- cur = build_lora_mm(model.layers[il].ffn_down, cur);
- cb(cur, "ffn_out", il);
- }
-
- cur = build_norm(cur,
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1); // NOTE(review): -1 is passed here while the callback below uses il — confirm this is intended
- cb(cur, "ffn_post_norm", il);
-
- ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
- cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
-
- ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
-
- ggml_tensor * first_prediction; // [n_embd, n_tokens]
- {
- first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
- first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
- first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
- first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
- cb(first_prediction, "first_prediction_gated", il);
- ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
- first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
- cb(first_prediction, "first_prediction_scaled", il);
-
- first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
- first_prediction = build_norm(first_prediction,
- model.layers[il].per_layer_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(first_prediction, "first_prediction_out", il);
- }
-
- // equivalent to python code: corrected_predictions[1:] += first_prediction
- {
- ggml_tensor * slice_first = view_2d_slice(corrected, 0);
- ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
- ggml_row_size(corrected->type, n_embd),
- ggml_row_size(corrected->type, n_embd*n_tokens),
- n_embd*n_tokens*ggml_element_size(corrected)); // byte offset of slice 1, i.e. skip slice 0
- ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
- corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
- }
-
- cur = corrected; // [n_embd, n_tokens, n_altup]
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL; // [n_embd, n_tokens, n_altup]
-
- // cur now has multiple altup(s), we want to merge them back to 1 altup
- {
- ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
- // do a view to skip the first slice (active altup)
- ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
- ggml_row_size(cur->type, n_embd),
- ggml_row_size(cur->type, n_embd*n_tokens),
- n_embd*n_tokens*ggml_element_size(cur)); // byte offset of slice 1
- ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
- ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
- altup_unembd = ggml_div(ctx0,
- ggml_mul(ctx0, altup_unembd, target_magnitude),
- new_magnitude);
- cb(altup_unembd, "altup_unembd", -1);
-
- // equivalent to torch.mean(hidden_states, dim=0)
- cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
- for (int i = 0; i < n_altup - 1; ++i) {
- cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
- }
- cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
- cb(cur, "unembd_merged", -1);
- }
-
- // cur now has shape: [n_embd, n_tokens]
-
- // TODO: move this to right after the last KV layer
- {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- }
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
-
- {
- // final logit soft-capping: softcap * tanh(logits / softcap)
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
- cur = ggml_tanh(ctx0, cur);
- cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
- }
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-
- ggml_tensor * calc_magnitude(ggml_tensor * x) { // sqrt of the per-row sum of squares of x
- return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
- }
-
- // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
- ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
- GGML_ASSERT(idx < (int)x->ne[2]);
- return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
- ggml_row_size(x->type, x->ne[0]),
- idx * x->ne[0] * x->ne[1] * ggml_element_size(x)); // byte offset of slice idx
- }
-
- // equivalent to get_per_layer_inputs() in python code
- // output shape: [n_embd_altup, n_layer, n_tokens]
- ggml_tensor * get_per_layer_inputs() {
- auto inp = std::make_unique<llm_graph_input_embd>();
- ggml_tensor * inp_per_layer;
- if (ubatch.token) {
- inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
- ggml_set_input(inp->tokens);
- res->t_tokens = inp->tokens;
- inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
- inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
- inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
- cb(inp_per_layer, "inp_per_layer_selected", -1);
- } else {
- GGML_ABORT("TODO: support embd input");
- }
- res->add_input(std::move(inp));
- return inp_per_layer;
- }
-
- // equivalent to project_per_layer_inputs() in python code
- // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
- // output shape: [n_embd_altup, n_tokens, n_layer]
- ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
- const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
- const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
-
- ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
- per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
- per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
- per_layer_proj = build_norm(per_layer_proj,
- model.per_layer_proj_norm, NULL,
- LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
- cb(per_layer_proj, "per_layer_proj", -1);
-
- inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
- inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
- cb(inp_per_layer, "inp_per_layer", -1);
-
- // permute to shape: [n_embd_altup, n_tokens, n_layer]
- inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
- return inp_per_layer;
- }
-
- // input cur shape: [n_embd, n_tokens]
- // output shape: [n_embd, n_tokens]
- ggml_tensor * laurel(ggml_tensor * cur, int il) {
- ggml_tensor * tmp = cur;
- tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
- tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
- tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
- tmp = ggml_add(ctx0, tmp, cur); // residual connection around the two projections
- cb(tmp, "laurel_out", il);
- return tmp;
- }
-
- // input x shape: [n_embd, n_tokens] (NOTE(review): the only caller passes the FFN gate projection; confirm its first dim is really n_embd)
- // output shape: same as x
- ggml_tensor * gaussian_topk(ggml_tensor * x) {
- ggml_tensor * mean = ggml_mean(ctx0, x);
- ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
- ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
- 1.0f / (float)(x->ne[0] - 1) // divide by (N - 1), N being the row length
- ));
- ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
- return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x)); // values at or below the cutoff become 0, the rest are shifted down by the cutoff
- }
-
- //
- // altup functions
- //
-
- // equivalent to compute_router_modalities() in python code
- // input x shape: [n_embd, n_tokens]
- // output shape: [n_altup, n_tokens]
- ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
- ggml_tensor * router_inputs = build_norm(x,
- model.layers[il].altup_router_norm, NULL,
- LLM_NORM_RMS, il);
-
- // router_input_scale
- router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
-
- ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
- return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
- }
-
- // input cur shape: [n_embd, n_tokens, n_altup]
- // output shape: [n_embd, n_tokens, n_altup]
- ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
- ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
- ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
- cb(modalities, "modalities", il);
-
- ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
- cb(all_coefs, "all_coefs", il);
- // the first dim now has n_altup^2 elements; split it into two dims of n_altup (so we end up with a 3D tensor)
- all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
-
- // permute to [n_altup, n_embd, n_tokens]
- ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
- ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
-
- // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
- predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
- predictions = ggml_add(ctx0, predictions, cur);
- cb(predictions, "predictions", il);
-
- return predictions;
- }
-
- // input predictions shape: [n_embd, n_tokens, n_altup]
- // input activated shape: [n_embd, n_tokens]
- // output shape: [n_embd, n_tokens, n_altup]
- ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
- ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
- cb(modalities, "modalities", il);
-
- ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
- ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
- cb(innovation, "innovation", il);
-
- ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
- all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
- cb(all_coefs, "all_coefs", il);
- all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
- all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
-
- innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
- ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
- corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
- cb(corrected, "corrected", il);
-
- return corrected;
- }
-};
-
-// Builds the compute graph for the Gemma embedding model, using the attention input
-// returned by build_attn_inp_no_cache().
-//
-// Per layer: attn_norm (RMS) -> Q/K/V projections -> RMS norm of Q and K -> RoPE with the
-// per-layer frequency base/scale -> attention -> attn_post_norm -> residual add ->
-// ffn_norm -> build_ffn (LLM_FFN_GELU, LLM_FFN_PAR) -> ffn_post_norm -> residual add ->
-// build_cvec. The graph ends at the output_norm result, stored in res->t_embd; no
-// logits are produced.
-struct llm_build_gemma_embedding : public llm_graph_context {
-    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_k;
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
-        if (ubatch.token) {
-            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-            cb(inpL, "inp_scaled", -1);
-        }
-
-        // inp_pos - contains the positions
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        auto * inp_attn = build_attn_inp_no_cache();
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            // RoPE frequency parameters are looked up per layer
-            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-            // norm
-            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                // split the projections into heads: [n_embd_head, n_head(_kv), n_tokens]
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-                // Q and K are RMS-normalized before RoPE; V is left as is
-                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-                cb(Qcur, "Qcur_normed", il);
-
-                Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-                cb(Kcur, "Kcur_normed", il);
-
-                Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
-                // the attention scale is folded into Q here, so build_attn is called with a scale of 1.0f
-                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
-
-                cur = build_attn(inp_attn,
-                        model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
-            }
-
-            // on the last layer keep only the rows selected by inp_out_ids
-            if (il == n_layer - 1 && inp_out_ids) {
-                cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-            }
-
-            cur = build_norm(cur,
-                    model.layers[il].attn_post_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_post_norm", il);
-
-            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
-            cb(sa_out, "sa_out", il);
-
-            cur = build_norm(sa_out,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            // feed-forward network
-            {
-                cur = build_ffn(cur,
-                        model.layers[il].ffn_up,   NULL, NULL,
-                        model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            // this is a per-layer tensor: pass the layer index (not -1) so it is tagged
-            // like every other per-layer tensor built in this loop
-            cur = build_norm(cur,
-                    model.layers[il].ffn_post_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_post_norm", il);
-
-            cur = ggml_add(ctx0, cur, sa_out);
-
-            cur = build_cvec(cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = build_norm(cur,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-// TODO: move up next to build_starcoder
-// Graph builder for StarCoder2.
-//
-// Each layer runs: attn_norm (LLM_NORM with bias) -> Q/K/V projections with optional
-// biases -> RoPE on Q and K -> attention -> residual add -> ffn_norm -> sequential
-// GELU FFN (up/down with biases) -> residual add -> build_cvec. The final output_norm
-// result goes to res->t_embd and the model.output projection to res->t_logits.
-struct llm_build_starcoder2 : public llm_graph_context {
-    llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        // graph inputs: token embeddings, positions, KV attention input, output row ids
-        ggml_tensor * hidden      = build_inp_embd(model.tok_embd);
-        ggml_tensor * inp_pos     = build_inp_pos();
-        auto        * inp_attn    = build_attn_inp_kv();
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
-
-        // rotary embedding with the context-wide frequency parameters
-        const auto rope = [&](ggml_tensor * t) {
-            return ggml_rope_ext(
-                    ctx0, t, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-        };
-
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers[il];
-
-            // kept for the residual connection around attention
-            ggml_tensor * residual = hidden;
-
-            // pre-attention norm
-            ggml_tensor * cur = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // weight projection of the normed input, followed by the bias when the layer has one
-                const auto project = [&](ggml_tensor * w, ggml_tensor * b, const char * name) {
-                    ggml_tensor * t = build_lora_mm(w, cur);
-                    cb(t, name, il);
-                    if (b) {
-                        t = ggml_add(ctx0, t, b);
-                        cb(t, name, il);
-                    }
-                    return t;
-                };
-
-                ggml_tensor * q = project(layer.wq, layer.bq, "Qcur");
-                ggml_tensor * k = project(layer.wk, layer.bk, "Kcur");
-                ggml_tensor * v = project(layer.wv, layer.bv, "Vcur");
-
-                // split into heads
-                q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
-                k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
-                v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
-                // only Q and K are rotated
-                q = rope(q);
-                k = rope(k);
-
-                cb(q, "Qcur", il);
-                cb(k, "Kcur", il);
-                cb(v, "Vcur", il);
-
-                cur = build_attn(inp_attn,
-                        layer.wo, layer.bo,
-                        q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
-            }
-
-            // on the last layer keep only the rows selected by inp_out_ids
-            if (inp_out_ids && il == n_layer - 1) {
-                cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
-                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-            }
-
-            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network: norm, then up -> GELU -> down (no gate)
-            cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = build_ffn(cur,
-                    layer.ffn_up,   layer.ffn_up_b,   NULL,
-                    NULL,           NULL,             NULL,
-                    layer.ffn_down, layer.ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-
-            cur = build_cvec(cur, il);
-            cb(cur, "l_out", il);
-
-            // becomes the input of the next layer
-            hidden = cur;
-        }
-
-        // final norm
-        ggml_tensor * cur = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-struct llm_graph_context_mamba : public llm_graph_context {
- // The constructor body is empty; construction is delegated to the llm_graph_context base.
- llm_graph_context_mamba(const llm_graph_params & params)
- : llm_graph_context(params) {
- }
-
- ggml_tensor * build_mamba_layer( // builds one Mamba layer (conv + SSM scan) and writes the updated conv/ssm states back; returns a {n_embd, n_tokens} tensor
- llm_graph_input_rs * inp,
- ggml_tensor * cur,
- const llama_model & model,
- const llama_ubatch & ubatch,
- int il) {
-
- const auto * mctx_cur = inp->mctx;
-
- const auto kv_head = mctx_cur->get_head();
-
- const auto & layer = model.layers[il];
-
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t dt_rank = hparams.ssm_dt_rank;
- const int64_t n_head = d_inner; // the scan below is fed d_inner heads of size 1
- const int64_t head_dim = 1;
- const int64_t n_seqs = ubatch.n_seqs;
- // Some variants of the Mamba arch (e.g. FalconMamba) apply RMS norm on the B, C and Dt layers
- const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
-
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs());
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
- ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
- ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
-
- ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
- conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
- // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
- // split the above in two
- // => {d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
- ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); // second half of each row
-
- // conv
- {
- // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
- ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
-
- // copy last (d_conv - 1) columns back into the state cache
- ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); // byte offset skips the first n_seq_tokens columns
-
- ggml_build_forward_expand(gf,
- ggml_cpy(ctx0, last_conv,
- ggml_view_1d(ctx0, conv_states_all,
- (d_conv - 1)*(d_inner)*(n_seqs),
- kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); // destination starts at cell kv_head of the conv state buffer
-
- // 1D convolution
- // The equivalent is to make a self-overlapping view of conv_x
- // over d_conv columns at each stride in the 3rd dimension,
- // then element-wise multiply that with the conv1d weight,
- // then sum the elements of each row,
- // (the last two steps are a dot product over rows (also doable with mul_mat))
- // then permute away the ne[0] dimension,
- // and then you're left with the resulting x tensor.
- // For simultaneous sequences, all sequences need to have the same length.
- x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
-
- // bias
- x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
-
- x = ggml_silu(ctx0, x);
- }
-
- // ssm
- {
- // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
- ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
- // split each row into dt (first dt_rank values), B (next d_state) and C (last d_state)
- ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
- ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
- ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
-
- // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
- if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
- dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
- B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
- C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
- }
-
- // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
- dt = build_lora_mm(layer.ssm_dt, dt);
- dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
-
- cur = x; // keep the 3D x for the ssm_d skip term below
- x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
-
- ggml_tensor * A = layer.ssm_a;
-
- // use the states and the indices provided by build_recurrent_state
- // (this is necessary in order to properly use the states before they are overwritten,
- // while avoiding to make unnecessary copies of the states)
- auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
- ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
-
- // Custom operator to optimize the parallel associative scan
- // as described in the Annex D of the Mamba paper.
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
- return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
- };
-
- ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
-
- // store last states
- ggml_build_forward_expand(gf,
- ggml_cpy(ctx0,
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]), // source offset = byte size of x, i.e. past the y part of the scan output
- ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
-
- ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
-
- // TODO: skip computing output earlier for unused tokens
-
- y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
- y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
-
- // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(layer.ssm_out, y);
- }
-
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
-
- return cur;
- }
-
- ggml_tensor * build_mamba2_layer(
- llm_graph_input_rs * inp,
- ggml_tensor * cur,
- const llama_model & model,
- const llama_ubatch & ubatch,
- int il) const {
-
- const auto * mctx_cur = inp->mctx;
-
- const auto kv_head = mctx_cur->get_head();
-
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t n_head = hparams.ssm_dt_rank;
- const int64_t head_dim = d_inner / n_head;
- const int64_t n_group = hparams.ssm_n_group;
- const int64_t n_seqs = ubatch.n_seqs;
-
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs());
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
- ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
- ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
-
- ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
- conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
-
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
- // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
-
- // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
- ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
-
- // split the above in three
- ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
- ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
- ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
-
- // conv
- {
- // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
- ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
-
- // copy last (d_conv - 1) columns back into the state cache
- ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
-
- ggml_build_forward_expand(gf,
- ggml_cpy(ctx0, last_conv,
- ggml_view_1d(ctx0, conv_states_all,
- (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
- kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
-
- // 1D convolution
- // The equivalent is to make a self-overlapping view of conv_x
- // over d_conv columns at each stride in the 3rd dimension,
- // then element-wise multiply that with the conv1d weight,
- // then sum the elements of each row,
- // (the last two steps are a dot product over rows (also doable with mul_mat))
- // then permute away the ne[0] dimension,
- // and then you're left with the resulting x tensor.
- // For simultaneous sequences, all sequences need to have the same length.
- xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
-
- // bias
- xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
-
- xBC = ggml_silu(ctx0, xBC);
- }
-
- // ssm
- {
- // These correspond to V K Q in SSM/attention duality
- ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
- ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
- ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
-
- // {n_head, n_seq_tokens, n_seqs}
- dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
-
- ggml_tensor * A = model.layers[il].ssm_a;
-
- // use the states and the indices provided by build_recurrent_state
- // (this is necessary in order to properly use the states before they are overwritten,
- // while avoiding to make unnecessary copies of the states)
- auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
- ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
-
- // TODO: use semistructured matrices to implement state-space duality
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
- return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
- };
-
- ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
-
- // store last states
- ggml_build_forward_expand(gf,
- ggml_cpy(ctx0,
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
- ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
-
- ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
-
- // TODO: skip computing output earlier for unused tokens
-
- y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
- cb(y, "mamba2_y_add_d", il);
- y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
-
- // grouped RMS norm
- if (model.layers[il].ssm_norm) {
- y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
- y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
- }
-
- y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
-
- // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(model.layers[il].ssm_out, y);
- }
-
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
- cb(cur, "mamba_out", il);
-
- return cur;
- }
-};
-
-struct llm_build_mamba : public llm_graph_context_mamba {
- llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
-
- auto * rs_inp = build_rs_inp();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- if (model.arch == LLM_ARCH_MAMBA2) {
- cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
- } else {
- cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // residual
- cur = ggml_add(ctx0, cur, inpL);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- // final rmsnorm
- cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-
-};
-
-struct llm_build_jamba : public llm_graph_context_mamba {
- llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
-
- auto * inp_hybrid = build_inp_mem_hybrid();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const int64_t n_head_kv = hparams.n_head_kv(il);
-
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- if (n_head_kv == 0) {
- cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
- } else {
- // Attention
-
- struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- // No RoPE :)
- cur = build_attn(inp_hybrid->get_attn(),
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // residual
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
- cb(cur, "ffn_inp", il);
-
- cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // feed-forward network
- if (model.layers[il].ffn_gate_inp == nullptr) {
- // FFN
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- }
-
- // residual
- cur = ggml_add(ctx0, ffn_inp, cur);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- // final rmsnorm
- cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_command_r : public llm_graph_context {
- llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- const float f_logit_scale = hparams.f_logit_scale;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- ggml_tensor * ffn_inp = cur;
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- if (model.layers[il].attn_q_norm) {
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- NULL,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- }
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- if (model.layers[il].attn_k_norm) {
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- NULL,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
- }
-
- ggml_tensor * attn_out = cur;
-
- // feed-forward network
- {
- cur = build_ffn(ffn_inp,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- // add together residual + FFN + self-attention
- cur = ggml_add(ctx0, cur, inpL);
- cur = ggml_add(ctx0, cur, attn_out);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- if (f_logit_scale) {
- cur = ggml_scale(ctx0, cur, f_logit_scale);
- }
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_cohere2_iswa : public llm_graph_context {
- llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- const float f_logit_scale = hparams.f_logit_scale;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv_iswa();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const bool is_swa = hparams.is_swa(il);
-
- // norm
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
- cb(cur, "attn_norm", il);
- ggml_tensor * ffn_inp = cur;
-
- // self-attention
- {
- // rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- if (is_swa) {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
- }
-
- ggml_tensor * attn_out = cur;
-
- // feed-forward network
- {
- cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
- NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
- il);
- cb(cur, "ffn_out", il);
- }
-
- // add together residual + FFN + self-attention
- cur = ggml_add(ctx0, cur, inpL);
- cur = ggml_add(ctx0, cur, attn_out);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- if (f_logit_scale) {
- cur = ggml_scale(ctx0, cur, f_logit_scale);
- }
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// ref: https://allenai.org/olmo
-// based on the original build_llama() function, changes:
-// * non-parametric layer norm
-// * clamp qkv
-// * removed bias
-// * removed MoE
-struct llm_build_olmo : public llm_graph_context {
- llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- NULL, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- NULL, NULL,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- NULL, NULL,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-template <bool iswa>
-struct llm_build_olmo2 : public llm_graph_context {
- llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
- inp_attn_type * inp_attn = nullptr;
-
- if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_iswa();
- } else {
- inp_attn = build_attn_inp_kv();
- }
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- cur = inpL;
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- const bool is_swa = hparams.is_swa(il);
-
- if (is_swa) {
- // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
- // This is achieved here by setting freq_scale and attn_factor to 1.
- // We also set ext_factor to 0 to avoid a few unnecessary computations.
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
- 0.0, 1.0, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
- 0.0, 1.0, beta_fast, beta_slow
- );
- } else {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- cur = build_norm(cur,
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_ffn(ffn_inp,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = build_norm(cur,
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "ffn_post_norm", -1);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// based on the build_qwen2moe() function, changes:
-// * removed shared experts
-// * removed bias
-// * added q, k norm
-struct llm_build_olmoe : public llm_graph_context {
- llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_llada_moe : public llm_graph_context {
- llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_no_cache();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_openelm : public llm_graph_context {
- llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const int64_t n_head = hparams.n_head(il);
- const int64_t n_head_kv = hparams.n_head_kv(il);
- const int64_t n_head_qkv = 2*n_head_kv + n_head;
-
- cur = inpL;
- ggml_tensor * residual = cur;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
-
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
- cb(Vcur, "Vcur", il);
-
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur", il);
-
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, NULL,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, NULL,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Qcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- inpL = cur;
- }
-
- cur = inpL;
-
- // norm
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_gptneox : public llm_graph_context {
- llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // ffn
- if (hparams.use_par_res) {
- // attention and ffn are computed in parallel
- // x = x + attn(ln1(x)) + ffn(ln2(x))
-
- ggml_tensor * attn_out = cur;
-
- cur = build_norm(inpL,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, inpL);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, attn_out);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- } else {
- // attention and ffn are computed sequentially
- // x = x + attn(ln1(x))
- // x = x + ffn(ln2(x))
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_arctic : public llm_graph_context {
- llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
- cb(ffn_out, "ffn_out", il);
-
- // MoE
- cur = build_norm(inpSA,
- model.layers[il].ffn_norm_exps, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm_exps", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_out);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_deepseek : public llm_graph_context {
- llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- if ((uint32_t) il < hparams.n_layer_dense_lead) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, hparams.expert_weights_scale,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- // FFN shared expert
- {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- }
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_deepseek2 : public llm_graph_context {
- llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- bool is_lite = (hparams.n_layer == 27);
-
- const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
-
- // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
- const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
- const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
-
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
-
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
-
- // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
- // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
- const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
- const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- ggml_tensor * q = NULL;
- if (!is_lite) {
- q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
- cb(q, "q", il);
-
- q = build_norm(q,
- model.layers[il].attn_q_a_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(q, "q", il);
-
- q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
- cb(q, "q", il);
- } else {
- q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(q, "q", il);
- }
-
- // split into {n_embd_head_qk_nope, n_head, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
- n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, n_embd_head_k),
- ggml_row_size(q->type, n_embd_head_k) * n_head,
- 0);
- cb(q_nope, "q_nope", il);
-
- // and {n_embd_head_qk_rope, n_head, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
- n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, n_embd_head_k),
- ggml_row_size(q->type, n_embd_head_k) * n_head,
- ggml_row_size(q->type, n_embd_head_qk_nope));
- cb(q_pe, "q_pe", il);
-
- ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
- cb(kv_cmpr_pe, "kv_cmpr_pe", il);
-
- // split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
- kv_lora_rank, n_tokens,
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
- 0);
- cb(kv_cmpr, "kv_cmpr", il);
-
- // and {n_embd_head_qk_rope, 1, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
- n_embd_head_qk_rope, 1, n_tokens,
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
- cb(k_pe, "k_pe", il);
-
- q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
-
- k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
-
- kv_cmpr = build_norm(kv_cmpr,
- model.layers[il].attn_kv_a_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(kv_cmpr, "kv_cmpr", il);
-
- if (is_mla) {
- // {n_embd_head_qk_nope, n_tokens, n_head}
- q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
- cb(q_nope, "q_nope_perm", il);
-
- // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
- ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
- cb(q_nope_absorbed, "q_nope_absorbed", il);
-
- // {kv_lora_rank, n_head, n_tokens}
- q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
- cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
-
- // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
- // note: rope must go first for in-place context shifting in build_rope_shift()
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
- cb(Qcur, "Qcur", il);
-
- kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
- cb(kv_cmpr, "kv_cmpr_reshape", il);
-
- // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
- ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
- cb(Kcur, "Kcur", il);
-
- // {kv_lora_rank, 1, n_tokens}
- ggml_tensor * Vcur = kv_cmpr;
- cb(Vcur, "Vcur", il);
-
- // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
- } else {
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
- cb(kv, "kv", il);
-
- // split into {n_embd_head_qk_nope, n_head, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
- n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
- 0);
- cb(k_nope, "k_nope_view", il);
-
- // and {n_embd_head_v, n_head, n_tokens}
- ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
- n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
- ggml_row_size(kv->type, n_embd_head_qk_nope));
- cb(Vcur, "Vcur_view", il);
-
- Vcur = ggml_cont(ctx0, Vcur);
- cb(Vcur, "Vcur_cont", il);
-
- // note: rope must go first for in-place context shifting in build_rope_shift()
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
- cb(Kcur, "Kcur", il);
-
- // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- }
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- if ((uint32_t) il < hparams.n_layer_dense_lead) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
- (llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- // FFN shared expert
- {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- }
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = ggml_mul_mat(ctx0, model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_bitnet : public llm_graph_context {
- llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_scale) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
- }
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- // B1.K
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_scale) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
- }
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- // B1.V
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_scale) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
- }
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- NULL, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-
- cur = build_norm(cur,
- model.layers[il].attn_sub_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_sub_norm", il);
-
- cur = build_lora_mm(model.layers[il].wo, cur);
- if (model.layers[il].wo_scale) {
- cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
- }
- if (model.layers[il].bo) {
- cur = ggml_add(ctx0, cur, model.layers[il].bo);
- }
- cb(cur, "attn_o_out", il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward forward
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
- model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
- NULL, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_sub_out", il);
-
- cur = build_norm(cur,
- model.layers[il].ffn_sub_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_sub_norm", il);
-
- cur = build_lora_mm(model.layers[il].ffn_down, cur);
- if (model.layers[il].ffn_down_scale) {
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
- }
- cb(cur, "ffn_down", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- // FIXME: do not use model.tok_embd directly, duplicate as model.output
- cur = build_lora_mm(model.tok_embd, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_t5_enc : public llm_graph_context {
- llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
-
- auto * inp_attn = build_attn_inp_no_cache();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm_enc, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
- ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo_enc, nullptr,
- Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
- cb(cur, "kqv_out", il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm_enc, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = build_ffn(cur,
- model.layers[il].ffn_up_enc, NULL, NULL,
- model.layers[il].ffn_gate_enc, NULL, NULL,
- model.layers[il].ffn_down_enc, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cb(cur, "result_embd", -1);
-
- cur = build_norm(cur,
- model.output_norm_enc, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_t5_dec : public llm_graph_context { // Graph builder for the T5 decoder: per layer self-attention (with position bias), cross-attention over embd_enc, then an FFN; emits t_embd and t_logits.
- llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- ggml_tensor * embd_enc = build_inp_cross_embd(); // source of K/V for the cross-attention blocks below
- ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec(); // fed to build_pos_bias() for the self-attention bias
-
- const int64_t n_outputs_enc = embd_enc->ne[1]; // dimension 1 of embd_enc; used as the K/V length in the cross-attention reshapes
-
- auto * inp_attn_self = build_attn_inp_kv();
- auto * inp_attn_cross = build_attn_inp_cross();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- const int64_t dec_n_layer = hparams.dec_n_layer; // the loop runs over dec_n_layer, not n_layer
-
- for (int il = 0; il < dec_n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the self-attention block
-
- // RMS norm before self-attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; // layers without their own attn_rel_b reuse layer 0's
- ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
-
- cur = build_attn(inp_attn_self,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il); // kq scale is 1.0f; kq_b is passed as the attention bias
- cb(cur, "kqv_out", il);
- }
-
- cur = ggml_add(ctx0, cur, inpSA); // residual add after self-attention
- cb(cur, "cross_inp", il);
-
- ggml_tensor * inpCA = cur; // residual input for the cross-attention block
-
- // RMS norm before cross-attention
- cur = build_norm(cur,
- model.layers[il].attn_norm_cross, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm_cross", il);
-
- // cross-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); // Q comes from the decoder stream
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); // K and V are projected from embd_enc
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
-
- cur = build_attn(inp_attn_cross,
- model.layers[il].wo_cross, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); // no position bias and no output bias for cross-attention
- cb(cur, "kqv_out", il);
-
- //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-
- //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- //cb(kq, "kq", il);
-
- //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
- //cb(kq, "kq_soft_max_ext", il);
-
- //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
- //cb(v, "v", il);
-
- //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
- //cb(kqv, "kqv", il);
-
- //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- //cb(kqv_merged, "kqv_merged", il);
-
- //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- //cb(cur, "kqv_merged_cont", il);
-
- //ggml_build_forward_expand(gf, cur);
-
- //cur = build_lora_mm(model.layers[il].wo_cross, cur);
- //cb(cur, "kqv_out", il);
- }
-
- if (il == dec_n_layer - 1 && inp_out_ids) { // last decoder layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); // residual add after cross-attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // T5 uses relu, flan-T5 uses gelu-gated: the presence of ffn_gate selects GELU/parallel, otherwise RELU/sequential
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
- il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual add after the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cb(cur, "result_embd", -1);
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding result
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_jais : public llm_graph_context { // Graph builder for JAIS: biased LLM_NORM, fused QKV projection with bias, attention, then a parallel SiLU-gated FFN; emits t_embd and t_logits.
- llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur); // single fused projection producing Q, K and V
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv); // bias is added unconditionally (no null check on bqkv here)
- cb(cur, "bqkv", il);
-
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd)); // Q view starts at element offset 0
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd)); // K view starts after n_embd elements
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)); // V view starts after n_embd + n_embd_gqa elements
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il); // NOTE: scale is 1/n_embd_head, not the 1/sqrtf(n_embd_head) used by e.g. llm_build_chatglm
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
-
- // residual: add the layer input to the attention output
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward block
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
-
- inpL = ggml_add(ctx0, cur, ffn_inp); // residual add after the FFN; becomes the next layer's input
- cb(inpL, "l_out", il);
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur); // output projection producing the logits
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_chatglm : public llm_graph_context { // Graph builder for ChatGLM: RMS norm, separate or fused QKV projection, RoPE, attention, then a SWIGLU FFN; emits t_embd and t_logits.
- llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the attention block
-
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
-
- if (model.layers[il].wqkv == nullptr) { // no fused weight: use separate wq/wk/wv with optional biases
- Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- }
- Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- }
- Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- } else { // fused weight: one projection, then Q/K/V are taken as views into it
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- if (model.layers[il].bqkv) {
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); // byte offsets are computed with sizeof(float)
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); // K starts after n_embd floats
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); // V starts after n_embd + n_embd_gqa floats
- }
-
- //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // residual: add the layer input to the attention output
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward block
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); // no separate gate tensor is passed; SWIGLU is applied in sequential mode
- cb(cur, "ffn_out", il);
-
- }
-
- inpL = ggml_add(ctx0, cur, ffn_inp); // residual add after the FFN; becomes the next layer's input
- cb(inpL, "l_out", il);
- }
-
- cur = build_norm(inpL,
- model.output_norm,
- NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur); // output projection producing the logits
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_glm4 : public llm_graph_context { // Graph builder for GLM4: like the ChatGLM builder but with an extra RMS norm after attention and after the MLP, each applied before its residual add.
- llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the attention block
-
- // Pre-attention norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
-
- if (model.layers[il].wqkv == nullptr) { // no fused weight: use separate wq/wk/wv with optional biases
- Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- }
- Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- }
- Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- } else { // fused weight: one projection, then Q/K/V are taken as views into it
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- if (model.layers[il].bqkv) {
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); // byte offsets are computed with sizeof(float)
- Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); // K starts after n_embd floats
- Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); // V starts after n_embd + n_embd_gqa floats
- }
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // Post-attention norm, applied to the attention output before the residual add
- cur = build_norm(cur,
- model.layers[il].attn_post_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "post_attn_norm", il);
-
- // Add the input (residual connection after post-attention norm)
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward block
- {
- // Pre-MLP norm
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // MLP
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); // no separate gate tensor is passed; SWIGLU is applied in sequential mode
- cb(cur, "ffn_out", il);
-
- // Post-MLP norm
- cur = build_norm(cur,
- model.layers[il].ffn_post_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "post_mlp_norm", il);
- }
-
- // Add residual connection after post-MLP norm
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
- }
-
- // Final norm
- cur = build_norm(inpL,
- model.output_norm,
- NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // Output projection
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_glm4_moe : public llm_graph_context { // Graph builder for GLM4-MoE: attention with optional Q/K norm and RoPE, followed by a dense FFN on the leading layers and routed + shared experts on the rest.
- llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- // Process only the first n_layer - nextn_predict_layers layers; the trailing NextN layers are skipped here
- // Final layer tensors are loaded but not processed in forward pass
- const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
- for (int il = 0; il < n_transformer_layers; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the attention block
-
- // Pre-attention norm
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- }
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- }
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- }
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- // Apply Q/K norm if available (GLM-4.5 355B variant)
- if (model.layers[il].attn_q_norm) {
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
- }
- if (model.layers[il].attn_k_norm) {
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
- }
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_transformer_layers - 1 && inp_out_ids) { // last processed layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual add after attention
- cb(ffn_inp, "ffn_inp", il);
-
- // Post-attention norm (applied after the residual add; its output feeds the FFN / MoE below)
- cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "post_attn_norm", il);
-
- // Layers with index below hparams.n_layer_dense_lead use a dense FFN; all later layers use the MoE path
- if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
- // Dense FFN layer
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // Process routed experts using existing MoE infrastructure
- ggml_tensor * routed_out = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
- (llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
- cb(routed_out, "ffn_moe_out", il);
-
- // Shared expert runs on the same normed input (cur) as the routed experts
- ggml_tensor * shared_out = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(shared_out, "ffn_shexp_out", il);
-
- // Final output: routed_output + shared_output
- cur = ggml_add(ctx0, routed_out, shared_out);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual add after the FFN / MoE
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_nemotron : public llm_graph_context { // Graph builder for Nemotron: biased LLM_NORM, Q/K/V with optional biases and RoPE, attention, then an ungated squared-ReLU FFN; emits t_embd and t_logits.
- llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- //GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the attention block
-
- // pre-attention norm (LLM_NORM with weight and bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual add after attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); // no gate tensor: up projection, squared-ReLU activation, down projection
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual add after the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, model.output_norm_b,
- LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_nemotron_h : public llm_graph_context_mamba { // Graph builder for Nemotron-H: every layer is exactly one of a Mamba2 (recurrent) block, an attention block or an FFN block, each wrapped in RMS norm + residual.
- llm_build_nemotron_h(
- const llama_model & model,
- const llm_graph_params & params) :
- llm_graph_context_mamba(params) {
-
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
- ggml_build_forward_expand(gf, inpL);
-
- auto * inp = build_inp_mem_hybrid(); // provides get_recr() for recurrent layers and get_attn() for attention layers
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL; // residual input for this layer's block
-
- // pre-block RMS norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- if (hparams.is_recurrent(il)) {
- // ssm layer //
- cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
- } else if (hparams.n_ff(il) == 0) { // a non-recurrent layer with n_ff == 0 is treated as an attention layer
- // attention layer //
- cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
- } else { // otherwise the layer is an FFN layer
- cur = build_ffn_layer(cur, model, il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // add residual
- cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "nemotron_h_block_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-
- ggml_tensor * build_attention_layer( // Builds one attention block for layer il from the already-normed input cur and returns the attention output.
- ggml_tensor * cur,
- llm_graph_input_attn_kv * inp_attn,
- const llama_model & model,
- const int64_t n_embd_head,
- const int il) {
-
- // compute the Q, K and V projections with optional biases (no RoPE is applied in this function)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens); // head counts are looked up per layer here
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // an f_attention_scale of 0 selects the default 1/sqrt(n_embd_head)
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- return cur;
- }
-
- ggml_tensor * build_ffn_layer( // Builds one FFN block for layer il (squared-ReLU, no gate tensor), applies build_cvec and returns the result.
- ggml_tensor * cur,
- const llama_model & model,
- const int il) {
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_PAR, il); // NOTE(review): LLM_FFN_PAR is passed although no gate tensor is supplied; llm_build_nemotron uses LLM_FFN_SEQ for the same shape
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- return cur;
- }
-};
-
-struct llm_build_exaone : public llm_graph_context { // Graph builder for EXAONE: RMS norm, Q/K/V with optional biases, RoPE with per-layer rope factors, attention, then a parallel SiLU-gated FFN.
- llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and requires n_rot to equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the attention block
-
- // pre-attention RMS norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // RoPE frequency factors for this layer, forwarded to both ggml_rope_ext calls below
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual add after attention
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual add after the FFN
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-template <bool iswa>
-struct llm_build_exaone4 : public llm_graph_context { // Graph builder for EXAONE 4: no pre-norms; Q/K RMS norm, conditional RoPE, attention, then post-attention and post-FFN RMS norms before each residual add. `iswa` selects the attention input type.
- llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v); // this builder requires equal K and V head sizes
- GGML_ASSERT(n_embd_head == hparams.n_rot); // and requires n_rot to equal the head size
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; // attention input type chosen at compile time from iswa
- inp_attn_type * inp_attn = nullptr;
-
- if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_iswa();
- } else {
- inp_attn = build_attn_inp_kv();
- }
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual input for the attention block
-
- // use RoPE for SWA layers or non-SWA models
- const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
-
- cur = inpL; // no pre-attention norm: the projections consume the layer input directly
-
- // self-attention
- {
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // Q and K are RMS-normed unconditionally, before any RoPE
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
- cb(Kcur, "Kcur_normed", il);
-
- if (use_rope) {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cb(cur, "attn_out", il);
- }
-
- if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- cur = build_norm(cur,
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual add after the post-attention norm
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network (takes ffn_inp directly; there is no pre-FFN norm)
- cur = build_ffn(ffn_inp,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- cur = build_norm(cur,
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1); // NOTE(review): passes -1 here and in the cb below, while attn_post_norm above passes il; confirm whether the layer index was intended
- cb(cur, "ffn_post_norm", -1);
-
- cur = ggml_add(ctx0, cur, ffn_inp); // residual add after the post-FFN norm
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// Shared graph-building helpers for the RWKV6 family. The derived builders in
-// this file (llm_build_rwkv6 and llm_build_rwkv6qwen2) call build_rwkv6_time_mix
-// once per layer; llm_build_rwkv6 additionally calls build_rwkv6_channel_mix.
-struct llm_build_rwkv6_base : public llm_graph_context {
- const llama_model & model;
-
- // Only stores the model reference; graph construction is left to derived constructors.
- llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
- }
-
- // Channel-mix step for one layer.
- //   layer  - layer whose channel_mix_* tensors are used
- //   cur    - current activations
- //   x_prev - token-shifted activations that `cur` is interpolated with
- //   arch   - only LLM_ARCH_RWKV6 is handled; anything else aborts
- // Returns sigmoid(receptance * xr) * (value * relu(key * xk)^2).
- ggml_tensor * build_rwkv6_channel_mix(
- const llama_layer * layer,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- llm_arch arch) const {
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
- switch (arch) {
- case LLM_ARCH_RWKV6:
- {
- // lerp between cur and x_prev: x = cur + (x_prev - cur) * lerp
- ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
- ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
-
- ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
- ggml_tensor * k = ggml_sqr(
- ctx0,
- ggml_relu(
- ctx0,
- build_lora_mm(layer->channel_mix_key, xk)
- )
- );
- cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
- } break;
- default:
- GGML_ABORT("fatal error");
- }
-
- return cur;
- }
-
- // Time-mix step for layer `il`.
- //   inp    - recurrent-state graph input passed on to build_rs()
- //   cur    - current activations
- //   x_prev - token-shifted activations
- //   ubatch - supplies n_tokens, n_seqs and n_seq_tokens
- // A layer without a time_mix_first tensor is treated as the "qrwkv" variant:
- // sigmoid gate, k scaled by (1 - w), gated linear attention and no group norm.
- // Side effect: appends a copy of the updated wkv state into the layer's
- // recurrent state tensor to the graph `gf`.
- // Returns the projected output reshaped to (n_embd, n_seq_tokens, n_seqs).
- ggml_tensor * build_rwkv6_time_mix(
- llm_graph_input_rs * inp,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- const llama_ubatch & ubatch,
- int il) const {
- const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
-
- const auto n_tokens = ubatch.n_tokens;
- const auto n_seqs = ubatch.n_seqs;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_embd = hparams.n_embd;
- const auto head_size = hparams.wkv_head_size;
- const auto n_head = n_embd / head_size;
- const auto n_head_kv = hparams.n_head_kv(il);
-
- const auto kv_head = mctx_cur->get_head();
-
- const auto & layer = model.layers[il];
-
- bool is_qrwkv = layer.time_mix_first == nullptr;
-
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
-
- sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
-
- ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
-
- // tanh(w1 * xxx) is split into 5 slices, one per w/k/v/r/g view taken below
- xxx = ggml_reshape_4d(
- ctx0,
- ggml_tanh(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
- ),
- layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
- );
-
- xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
-
- // w2 is viewed as 5 matrices so each slice gets its own projection
- xxx = ggml_mul_mat(
- ctx0,
- ggml_reshape_4d(
- ctx0,
- layer.time_mix_w2,
- layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
- ),
- xxx
- );
-
- // the view offsets below are computed with sizeof(float), i.e. xxx is treated as F32
- ggml_tensor *xw, *xk, *xv, *xr, *xg;
- if (layer.time_mix_lerp_fused) {
- // fusing these weights makes some performance improvement
- sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
- cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
- xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
- xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
- xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
- } else {
- // for backward compatibility
- xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
- xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
-
- // same lerp as the fused branch, applied with one tensor per slice
- xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
- xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
- xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
- xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
- xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
- }
-
- // r/k/v projections; each bias is added only when the layer provides it
- ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
- ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
- ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
- if (layer.time_mix_receptance_b) {
- r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
- }
- if (layer.time_mix_key_b) {
- k = ggml_add(ctx0, k, layer.time_mix_key_b);
- }
- if (layer.time_mix_value_b) {
- v = ggml_add(ctx0, v, layer.time_mix_value_b);
- }
-
- ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
- if (is_qrwkv) {
- g = ggml_sigmoid(ctx0, g);
- } else {
- g = ggml_silu(ctx0, g);
- }
-
- // fewer k/v heads than heads: repeat each k/v head n_head / n_head_kv times
- if (n_head_kv != 0 && n_head_kv != n_head) {
- GGML_ASSERT(n_head % n_head_kv == 0);
- k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
- v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
- // tmp is only used as the target shape for ggml_repeat
- ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
- k = ggml_repeat(ctx0, k, tmp);
- v = ggml_repeat(ctx0, v, tmp);
- }
-
- k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
- v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
- r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
-
- // decay: w = exp(-exp(decay_w2 * tanh(decay_w1 * xw) + time_mix_decay))
- ggml_tensor * w = ggml_mul_mat(
- ctx0,
- layer.time_mix_decay_w2,
- ggml_tanh(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
- )
- );
-
- w = ggml_add(ctx0, w, layer.time_mix_decay);
- w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
- w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
-
- if (is_qrwkv) {
- // k = k * (1 - w)
- k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
- }
-
- ggml_tensor * wkv_state = build_rs(
- inp, mctx_cur->get_s_l(il),
- hparams.n_embd_s(), n_seqs);
-
- ggml_tensor * wkv_output;
- if (is_qrwkv) {
- wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
- } else {
- wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
- }
- // wkv_output is read as two consecutive regions: the first n_embd * n_tokens
- // elements are the token outputs, the following n_embd * head_size * n_seqs
- // elements are the updated state
- cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
- wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
-
- // write the updated state back into the layer's state tensor, starting at slot kv_head
- ggml_build_forward_expand(
- gf,
- ggml_cpy(
- ctx0,
- wkv_state,
- ggml_view_1d(
- ctx0,
- mctx_cur->get_s_l(il),
- hparams.n_embd_s() * n_seqs,
- hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
- )
- )
- );
-
- if (!is_qrwkv) {
- // group norm with head_count groups
- cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
- cur = ggml_norm(ctx0, cur, 64e-5f);
-
- // Convert back to regular vectors.
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
- } else {
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- }
-
- // apply the gate, then the output projection
- cur = ggml_mul(ctx0, cur, g);
- cur = build_lora_mm(layer.time_mix_output, cur);
-
- return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
- }
-};
-
-// Graph builder for RWKV6. The whole graph is assembled in the constructor:
-// normalized token embeddings -> n_layer x (time mix + channel mix, each with a
-// residual add) -> output norm -> lm head. Results are stored in res->t_embd
-// and res->t_logits.
-struct llm_build_rwkv6 : public llm_build_rwkv6_base {
- llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
- // two shift slots per layer: one for the time-mix input, one for the channel-mix input
- GGML_ASSERT(hparams.token_shift_count == 2);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
- inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
-
- auto * rs_inp = build_rs_inp();
-
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
-
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
-
- // slot 0 of the loaded shift state feeds the time mix, slot 1 the channel mix
- ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
- ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
-
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
- cb(att_norm, "attn_norm", il);
-
- // x_prev = stored shift token followed by the first n_seq_tokens - 1 tokens of att_norm
- ggml_tensor * x_prev = ggml_concat(
- ctx0,
- att_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
-
- cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
- cb(ffn_norm, "ffn_norm", il);
-
- // same shifted construction, now for the channel-mix input
- x_prev = ggml_concat(
- ctx0,
- ffn_shift,
- ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
- 1
- );
-
- // new shift state: the last token of att_norm and of ffn_norm for each sequence
- token_shift = ggml_concat(ctx0,
- ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
- ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
- 1
- );
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
-
- ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
- ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
- x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
-
- // last layer only: keep just the rows selected by inp_out_ids
- if (il == n_layer - 1 && inp_out_ids) {
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
- ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
- x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- }
-
- cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- // halve the activations after every rescale_every_n_layers-th layer (0 disables this)
- if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
- cur = ggml_scale(ctx0, cur, 0.5F);
- }
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
-// Graph builder that pairs the RWKV6 time mix with an RMS-normed, SiLU-gated
-// parallel FFN in place of the RWKV channel mix. Only one token-shift slot per
-// layer is used (for the time-mix input). The whole graph is assembled in the
-// constructor and the results are stored in res->t_embd and res->t_logits.
-struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
- llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
- // NOTE(review): this n_embd is not the local declared further down, so it must
- // resolve to a name from outside this block (presumably an inherited member) - confirm
- GGML_ASSERT(n_embd == hparams.n_embd_r());
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- auto * rs_inp = build_rs_inp();
-
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
-
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
-
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
- cb(att_norm, "attn_norm", il);
-
- // x_prev = stored shift token followed by the first n_seq_tokens - 1 tokens of att_norm
- ggml_tensor * x_prev = ggml_concat(
- ctx0,
- token_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
-
- cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
-
- // new shift state: the last token of att_norm for each sequence
- token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
-
- // last layer only: keep just the rows selected by inp_out_ids
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
- }
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- // residual connection around the FFN
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// Shared graph-building helpers for the RWKV7 family. The derived builders in
-// this file (llm_build_rwkv7 and llm_build_arwkv7) call build_rwkv7_time_mix
-// once per layer; llm_build_rwkv7 additionally calls build_rwkv7_channel_mix.
-struct llm_build_rwkv7_base : public llm_graph_context {
- const llama_model & model;
-
- // Only stores the model reference; graph construction is left to derived constructors.
- llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
- }
-
- // Channel-mix step for one layer.
- //   layer  - layer whose channel_mix_* tensors are used
- //   cur    - current activations
- //   x_prev - token-shifted activations that `cur` is interpolated with
- //   arch   - only LLM_ARCH_RWKV7 is handled; anything else aborts
- // Returns value * relu(key * xk)^2; there is no receptance gate in this variant.
- ggml_tensor * build_rwkv7_channel_mix(
- const llama_layer * layer,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- llm_arch arch) const {
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
- switch (arch) {
- case LLM_ARCH_RWKV7:
- {
- // lerp between cur and x_prev: xk = cur + (x_prev - cur) * lerp_k
- ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
-
- ggml_tensor * k = ggml_sqr(
- ctx0,
- ggml_relu(
- ctx0,
- build_lora_mm(layer->channel_mix_key, xk)
- )
- );
-
- cur = build_lora_mm(layer->channel_mix_value, k);
- } break;
- default:
- GGML_ABORT("fatal error");
- }
-
- return cur;
- }
-
- // Time-mix step for layer `il`.
- //   inp               - recurrent-state graph input passed on to build_rs()
- //   cur               - current activations
- //   x_prev            - token-shifted activations
- //   first_layer_value - in/out: set to this call's `v` when it is nullptr on
- //                       entry; otherwise `v` is blended towards it
- //   ubatch            - supplies n_tokens, n_seqs and n_seq_tokens
- // Gating is applied only when the layer has both time_mix_g1 and time_mix_g2.
- // Side effect: appends a copy of the updated wkv state into the layer's
- // recurrent state tensor to the graph `gf`.
- // Returns the projected output reshaped to (n_embd, n_seq_tokens, n_seqs).
- ggml_tensor * build_rwkv7_time_mix(
- llm_graph_input_rs * inp,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- ggml_tensor *& first_layer_value,
- const llama_ubatch & ubatch,
- int il) const {
- const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
-
- const auto n_tokens = ubatch.n_tokens;
- const auto n_seqs = ubatch.n_seqs;
- const auto n_embd = hparams.n_embd;
- const auto head_size = hparams.wkv_head_size;
- const auto head_count = n_embd / head_size;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
-
- const auto kv_head = mctx_cur->get_head();
-
- const auto & layer = model.layers[il];
-
- bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
-
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
- // dummy is only the target shape for ggml_repeat: one copy of sx per
- // r/w/k/v/a slice, plus one for g when gating is present
- ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
- sx = ggml_repeat(ctx0, sx, dummy);
-
- ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
-
- // slice the fused lerp result; offsets are computed with sizeof(float)
- ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
- ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
- ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
-
- ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
- // decay: w = exp(-0.606531 * sigmoid(w2 * tanh(w1 * xw) + w0));
- // the constant is numerically about exp(-0.5)
- ggml_tensor * w = ggml_add(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
- layer.time_mix_w0
- );
- w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
-
- ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
- ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
- if (first_layer_value == nullptr) {
- // first call: remember v so later calls can blend towards it
- first_layer_value = v;
- } else {
- // Add the first layer value as a residual connection.
- // v = v + (first_layer_value - v) * sigmoid(v2 * (v1 * xv) + v0)
- v = ggml_add(ctx0, v,
- ggml_mul(ctx0,
- ggml_sub(ctx0, first_layer_value, v),
- ggml_sigmoid(ctx0, ggml_add(ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
- layer.time_mix_v0
- )
- )
- )
- );
- }
-
- // optional gate: g = g2 * sigmoid(g1 * xg)
- ggml_tensor * g = nullptr;
- if (layer.time_mix_g1 && layer.time_mix_g2) {
- g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
- }
-
- // a = sigmoid(a2 * (a1 * xa) + a0)
- ggml_tensor * a = ggml_sigmoid(ctx0,
- ggml_add(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
- layer.time_mix_a0
- )
- );
-
- // kk = per-head L2-normalized (k * time_mix_k_k)
- ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
- kk = ggml_l2_norm(ctx0, kk, 1e-12);
-
- // k = k + (a - 1) * (k * time_mix_k_a)
- ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
- k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
-
- r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
- w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
- k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
- v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
- a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
-
- ggml_tensor * wkv_state = build_rs(
- inp, mctx_cur->get_s_l(il),
- hparams.n_embd_s(), n_seqs);
-
- // wkv_output is read as two consecutive regions: the first n_embd * n_tokens
- // elements are the token outputs, the following n_embd * head_size * n_seqs
- // elements are the updated state
- ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
- cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
- wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
-
- // write the updated state back into the layer's state tensor, starting at slot kv_head
- ggml_build_forward_expand(
- gf,
- ggml_cpy(
- ctx0,
- wkv_state,
- ggml_view_1d(
- ctx0,
- mctx_cur->get_s_l(il),
- hparams.n_embd_s() * n_seqs,
- hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
- )
- )
- );
-
- if (layer.time_mix_ln && layer.time_mix_ln_b) {
- // group norm with head_count groups
- cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
- cur = ggml_norm(ctx0, cur, 64e-5f);
-
- // Convert back to regular vectors.
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
- } else {
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- }
-
- // bonus term: cur += v * sum_rows(k * r * time_mix_r_k), computed per head
- ggml_tensor * rk = ggml_sum_rows(ctx0,
- ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
- cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
-
- if (has_gating) {
- cur = ggml_mul(ctx0, cur, g);
- }
- cur = build_lora_mm(layer.time_mix_output, cur);
-
- return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
- }
-};
-
-// Graph builder for RWKV7. The whole graph is assembled in the constructor:
-// normalized token embeddings -> n_layer x (time mix + channel mix, each with a
-// residual add) -> output norm -> lm head. Results are stored in res->t_embd
-// and res->t_logits.
-struct llm_build_rwkv7 : public llm_build_rwkv7_base {
- llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
- // two shift slots per layer: one for the time-mix input, one for the channel-mix input
- GGML_ASSERT(hparams.token_shift_count == 2);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
- // set by the first build_rwkv7_time_mix call and reused by every later layer
- ggml_tensor * v_first = nullptr;
-
- inpL = build_inp_embd(model.tok_embd);
- inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
-
- auto * rs_inp = build_rs_inp();
-
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
-
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
-
- // slot 0 of the loaded shift state feeds the time mix, slot 1 the channel mix
- ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
- ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
-
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
- cb(att_norm, "attn_norm", il);
-
- // x_prev = stored shift token followed by the first n_seq_tokens - 1 tokens of att_norm
- ggml_tensor * x_prev = ggml_concat(
- ctx0,
- att_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
-
- cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
- cb(ffn_norm, "ffn_norm", il);
-
- // same shifted construction, now for the channel-mix input
- x_prev = ggml_concat(
- ctx0,
- ffn_shift,
- ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
- 1
- );
-
- // new shift state: the last token of att_norm and of ffn_norm for each sequence
- token_shift = ggml_concat(ctx0,
- ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
- ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
- 1
- );
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
-
- ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
- ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
- x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
-
- // last layer only: keep just the rows selected by inp_out_ids
- if (il == n_layer - 1 && inp_out_ids) {
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
- ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
- x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
- }
-
- cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-
-// Graph builder that pairs the RWKV7 time mix with an RMS-normed, SiLU-gated
-// parallel FFN in place of the RWKV channel mix. Only one token-shift slot per
-// layer is used (for the time-mix input). The whole graph is assembled in the
-// constructor and the results are stored in res->t_embd and res->t_logits.
-struct llm_build_arwkv7 : public llm_build_rwkv7_base {
- llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
- // NOTE(review): this n_embd is not the local declared further down, so it must
- // resolve to a name from outside this block (presumably an inherited member) - confirm
- GGML_ASSERT(n_embd == hparams.n_embd_r());
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
- // set by the first build_rwkv7_time_mix call and reused by every later layer
- ggml_tensor * v_first = nullptr;
-
- inpL = build_inp_embd(model.tok_embd);
-
- auto * rs_inp = build_rs_inp();
-
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
-
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
-
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
- cb(att_norm, "attn_norm", il);
-
- // x_prev = stored shift token followed by the first n_seq_tokens - 1 tokens of att_norm
- ggml_tensor * x_prev = ggml_concat(
- ctx0,
- token_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
-
- cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
-
- // new shift state: the last token of att_norm for each sequence
- token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
-
- // last layer only: keep just the rows selected by inp_out_ids
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
- }
-
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- // residual connection around the FFN
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// Graph builder for Granite models. The constructor assembles the whole graph:
-// token embeddings -> n_layer x (RMS norm, attention, FFN or MoE with scaled
-// residuals) -> output norm -> lm head -> logit scaling. Results are stored in
-// res->t_embd and res->t_logits.
-struct llm_build_granite : public llm_graph_context {
- llm_build_granite(
- const llama_model & model,
- const llm_graph_params & params)
- : llm_graph_context(params) {
-
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - built only if rope enabled
- ggml_tensor * inp_pos = nullptr;
- if (hparams.rope_finetuned) {
- inp_pos = build_inp_pos();
- }
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // kept as the residual input for build_layer_ffn()
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- cur = build_attention_layer(
- cur, inp_pos, inp_attn,
- model, n_embd_head, il);
-
- // last layer only: keep just the rows selected by inp_out_ids
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // ffn
- cur = build_layer_ffn(cur, inpSA, model, il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- // For Granite architectures - scale logits
- // NOTE(review): unlike llm_build_granite_hybrid below, this division is not
- // guarded against f_logit_scale == 0 - confirm the value is always non-zero here
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-
- // Builds the self-attention sub-graph for layer `il`.
- //   cur         - normed layer input
- //   inp_pos     - positions for RoPE; only read when hparams.rope_finetuned is set
- //   inp_attn    - KV attention input handed to build_attn()
- //   n_embd_head - per-head dimension used for the Q/K/V reshapes
- // Q/K/V biases are added only when the layer provides them.
- // Returns the attention output after the wo/bo projection done by build_attn().
- ggml_tensor * build_attention_layer(
- ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv * inp_attn,
- const llama_model & model,
- const int64_t n_embd_head,
- const int il) {
-
- // compute Q and K and (optionally) RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
-
- // same flag that decides whether the constructor builds inp_pos
- const bool use_rope = hparams.rope_finetuned;
- if (use_rope) {
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- // f_attention_scale == 0 selects the 1/sqrt(n_embd_head) fallback
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- return cur;
- }
-
- // Builds everything after attention for layer `il`: the scaled residual add,
- // the FFN norm, either a dense FFN or a MoE FFN (plus a shared expert when
- // hparams.n_ff_shexp > 0), a second scaled residual add and build_cvec().
- //   cur   - attention output
- //   inpSA - layer input used as the first residual
- // Returns the layer output.
- ggml_tensor * build_layer_ffn(
- ggml_tensor * cur,
- ggml_tensor * inpSA,
- const llama_model & model,
- const int il) {
-
- // For Granite architectures - scale residual
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network (non-MoE)
- // a layer without ffn_gate_inp takes the dense path
- if (model.layers[il].ffn_gate_inp == nullptr) {
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- } else {
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * moe_out = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- // For Granite MoE Shared
- // the shared expert reads the same normed input and is summed with the MoE output
- if (hparams.n_ff_shexp > 0) {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- } else {
- cur = moe_out;
- }
- }
-
- // For Granite architectures - scale residual
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- return cur;
- }
-};
-// Graph builder for the Granite hybrid architecture: each layer runs either a Mamba2 (recurrent) block or an attention block, chosen by hparams.is_recurrent(il), followed by a dense or MoE feed-forward block.
-struct llm_build_granite_hybrid : public llm_graph_context_mamba {
- llm_build_granite_hybrid(
- const llama_model & model,
- const llm_graph_params & params) :
- llm_graph_context_mamba(params) {
-
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- auto * inp = build_inp_mem_hybrid();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- // Position input is only built when RoPE is enabled (hparams.rope_finetuned); otherwise inp_pos stays nullptr
- ggml_tensor * inp_pos = nullptr;
- if (hparams.rope_finetuned) {
- inp_pos = build_inp_pos();
- }
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
-
- // RMS norm of the layer input
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- if (hparams.is_recurrent(il)) {
- // recurrent (Mamba2) layer
- cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
- } else {
- // attention layer
- cur = build_attention_layer(
- cur, inp_pos, inp->get_attn(), model,
- n_embd_head, il);
- }
- // last layer: keep only the rows selected by inp_out_ids (when present)
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- // residual add + feed-forward block (dense or MoE)
- cur = build_layer_ffn(cur, inpSA, model, il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- // For Granite architectures - scale logits
- if (hparams.f_logit_scale) {
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- }
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
- // Builds one attention block: Q/K/V projections with optional biases, optional RoPE on Q/K, then build_attn with wo/bo.
- ggml_tensor * build_attention_layer(
- ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv * inp_attn,
- const llama_model & model,
- const int64_t n_embd_head,
- const int il) {
-
- // compute Q and K and (optionally) RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
-
- const bool use_rope = hparams.rope_finetuned;
- if (use_rope) {
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- // use hparams.f_attention_scale when it is non-zero, else fall back to 1/sqrt(n_embd_head)
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- return cur;
- }
- // Adds the attention/SSM output to the residual inpSA, runs the dense or MoE FFN on the normed sum, then adds the second residual and the control vector.
- ggml_tensor * build_layer_ffn(
- ggml_tensor * cur,
- ggml_tensor * inpSA,
- const llama_model & model,
- const int il) {
-
- // For Granite architectures - scale residual
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network (non-MoE): taken when the layer has no ffn_gate_inp tensor
- if (model.layers[il].ffn_gate_inp == nullptr) {
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
-
- } else {
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * moe_out = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- // For Granite MoE Shared: add the shared-expert FFN output when hparams.n_ff_shexp > 0
- if (hparams.n_ff_shexp > 0) {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- } else {
- cur = moe_out;
- }
- }
-
- // For Granite architectures - scale residual
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- return cur;
- }
-};
-
-// Chameleon graph builder, ref: https://github.com/facebookresearch/chameleon
-// based on the original build_llama() function, changes:
-// * qk-norm (LLM_NORM on Q / K when attn_q_norm / attn_k_norm are present)
-// * swin-norm (when hparams.swin_norm is set, attn_norm / ffn_norm are applied to the block outputs instead of the block inputs)
-// * removed bias
-// * removed MoE
-struct llm_build_chameleon : public llm_graph_context {
- llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm (skipped here when swin_norm is set; attn_norm is then applied after attention)
- if (hparams.swin_norm) {
- cur = inpL;
- } else {
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- }
-
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- if (model.layers[il].attn_q_norm) {
- Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
- ggml_element_size(Qcur) * n_embd_head,
- ggml_element_size(Qcur) * n_embd_head * n_head,
- 0);
- cb(Qcur, "Qcur", il);
-
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- }
-
- if (model.layers[il].attn_k_norm) {
- Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
- ggml_element_size(Kcur) * n_embd_head,
- ggml_element_size(Kcur) * n_embd_head * n_head_kv,
- 0);
- cb(Kcur, "Kcur", il);
-
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer: keep only the rows selected by inp_out_ids (when present)
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // swin-norm: normalize the attention output before the residual add
- if (hparams.swin_norm) {
- cur = build_norm(cur,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- if (!hparams.swin_norm) {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- }
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- // swin-norm: normalize the FFN output before the residual add
- if (hparams.swin_norm) {
- cur = build_norm(cur,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output_with_img_logits", -1);
-
- // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
- // Needs to be removed once image outputs are supported.
- int img_token_end_idx = 8196;
- int img_token_start_idx = 4;
- int num_img_tokens = img_token_end_idx - img_token_start_idx;
- // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
- // which ensures that text token values are always at least larger than image token values
- ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
- img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
- cb(img_logits, "img_logits", -1);
- // write img_logits into the logits at a byte offset of img_token_start_idx elements
- cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for the WavTokenizer decoder: input conv1d, the posnet layers (residual conv blocks, one attention block, a final group norm), the convnext layers, then a final norm and output projection stored in res->t_embd.
-struct llm_build_wavtokenizer_dec : public llm_graph_context {
- llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
- // transpose the embeddings before the 1d convolutions
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
-
- cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
- cur = ggml_add(ctx0, cur, model.conv1d_b);
-
- // posnet: the layer type is selected by the layer index
- for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
- const auto & layer = model.layers[il].posnet;
-
- inpL = cur;
-
- switch (il) {
- case 0:
- case 1:
- case 3:
- case 4:
- {
- cur = build_norm(cur,
- layer.norm1,
- layer.norm1_b,
- LLM_NORM_GROUP, 0);
- // x * sigmoid(x)
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
- cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.conv1_b);
-
- cur = build_norm(cur,
- layer.norm2,
- layer.norm2_b,
- LLM_NORM_GROUP, 0);
- // x * sigmoid(x)
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
- cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.conv2_b);
- // residual connection
- cur = ggml_add(ctx0, cur, inpL);
- } break;
- case 2:
- {
- cur = build_norm(cur,
- layer.attn_norm,
- layer.attn_norm_b,
- LLM_NORM_GROUP, 0);
-
- ggml_tensor * q;
- ggml_tensor * k;
- ggml_tensor * v;
-
- q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
- k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
- v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
-
- q = ggml_add(ctx0, q, layer.attn_q_b);
- k = ggml_add(ctx0, k, layer.attn_k_b);
- v = ggml_add(ctx0, v, layer.attn_v_b);
-
- q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
- k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
-
- ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- // softmax with scale 1/sqrt(hparams.posnet.n_embd), no mask tensor
- kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
-
- cur = ggml_mul_mat(ctx0, kq, v);
-
- cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.attn_o_b);
- // residual connection
- cur = ggml_add(ctx0, cur, inpL);
- } break;
- case 5:
- {
- cur = build_norm(cur,
- layer.norm,
- layer.norm_b,
- LLM_NORM_GROUP, 0);
- } break;
- default: GGML_ABORT("unknown posnet layer");
- };
- }
-
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-
- cur = build_norm(cur,
- model.tok_norm,
- model.tok_norm_b,
- LLM_NORM, -1);
-
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-
- inpL = cur;
-
- // convnext: depthwise conv, norm, sequential GELU FFN, gamma scaling, residual add
- for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
- const auto & layer = model.layers[il].convnext;
-
- cur = inpL;
-
- cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.dw_b);
-
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-
- cur = build_norm(cur,
- layer.norm,
- layer.norm_b,
- LLM_NORM, -1);
-
- cur = build_ffn(cur,
- layer.pw1, layer.pw1_b, NULL,
- NULL, NULL, NULL,
- layer.pw2, layer.pw2_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
-
- cur = ggml_mul(ctx0, cur, layer.gamma);
-
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
- // residual connection
- inpL = ggml_add(ctx0, cur, inpL);
- }
-
- cur = inpL;
-
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-
- cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cur = ggml_add(ctx0, cur, model.output_b);
- // the projected output is exposed as embeddings (res->t_embd); res->t_logits is not set here
- cb(cur, "result_embd", -1);
- res->t_embd = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for PLM: attention with a compressed KV projection (wkv_a_mqa followed by wkv_b), RoPE applied only to the n_rot-sized slice of each Q/K head, and a sequential ReLU^2 FFN.
-struct llm_build_plm : public llm_graph_context {
- llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
- // each K head of size n_embd_head_k is split into a RoPE part (n_rot) and a non-RoPE part (the remainder)
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- ggml_tensor * q = NULL;
- q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(q, "q", il);
-
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- 0);
- cb(q_nope, "q_nope", il);
-
- // and {n_head * n_embd_head_qk_rope, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- ggml_row_size(q->type, n_embd_head_qk_nope));
- cb(q_pe, "q_pe", il);
-
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
-
- // split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
- kv_pe_compresseed->nb[1],
- 0);
- cb(kv_compressed, "kv_compressed", il);
-
- // and {n_embd_head_qk_rope, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
- kv_pe_compresseed->nb[1],
- kv_pe_compresseed->nb[1],
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
- cb(k_pe, "k_pe", il);
-
- kv_compressed = build_norm(kv_compressed,
- model.layers[il].attn_kv_a_norm, NULL,
- LLM_NORM_RMS, il);
- cb(kv_compressed, "kv_compressed", il);
-
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
- cb(kv, "kv", il);
-
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
-
- // and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
- cb(v_states, "v_states", il);
-
- v_states = ggml_cont(ctx0, v_states);
- cb(v_states, "v_states", il);
-
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
- 0);
- cb(v_states, "v_states", il);
- // RoPE is applied only to the q_pe / k_pe slices
- q_pe = ggml_rope_ext(
- ctx0, q_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
-
- // shared RoPE key
- k_pe = ggml_rope_ext(
- ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
- // reassemble full Q and K heads along dim 0; the single-head k_pe is repeated to the shape of q_pe
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
- cb(q_states, "q_states", il);
-
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
- cb(k_states, "k_states", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
- }
- // last layer: keep only the rows selected by inp_out_ids (when present)
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for BailingMoE: per layer, RoPE attention with optional Q/K/V biases, then a softmax-gated MoE FFN whose output is summed with a shared-expert FFN.
-struct llm_build_bailingmoe : public llm_graph_context {
- llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- // note: n_rot is used as the per-head dimension for the reshape and for the attention scale below
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
- }
- // last layer: keep only the rows selected by inp_out_ids (when present)
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- false, hparams.expert_weights_scale,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
-
- // FFN shared expert (built unconditionally and added to the MoE output)
- {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for BailingMoE2: fused QKV projection with RMS-normed Q/K, a dense FFN for the first hparams.n_layer_dense_lead layers and MoE plus shared expert for the rest.
-struct llm_build_bailingmoe2 : public llm_graph_context {
- llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- // the last hparams.nextn_predict_layers layers are not built by this loop
- const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
- for (int il = 0; il < n_transformer_layers; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- // split the fused output into Q, K and V views; strides and offsets use sizeof(float) - NOTE(review): assumes the wqkv result is F32, confirm
- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last built layer: keep only the rows selected by inp_out_ids (when present)
- if (il == n_transformer_layers - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
- cb(sa_out, "sa_out", il);
-
- // FFN norm, shared by the dense and MoE branches below
- cur = build_norm(sa_out,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
- (llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
- cb(moe_out, "ffn_moe_out", il);
- // shared expert: always built in the MoE branch and added to the MoE output
- {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- }
- }
-
- cur = ggml_add(ctx0, cur, sa_out);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for dots1: separate Q/K/V projections with RMS-normed Q/K and RoPE, a dense FFN for the first hparams.n_layer_dense_lead layers and MoE plus shared expert for the rest.
-struct llm_build_dots1 : public llm_graph_context {
- llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // Q and K are RMS-normed before RoPE is applied
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer: keep only the rows selected by inp_out_ids (when present)
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // FFN norm, shared by the dense and MoE branches below
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- if ((uint32_t) il < hparams.n_layer_dense_lead) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
- (llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
- cb(moe_out, "ffn_moe_out", il);
- // shared expert: always built in the MoE branch and added to the MoE output
- {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
-
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- }
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-// Graph builder for the dense ERNIE 4.5 architecture.
-//
-// Per layer: RMS norm -> self-attention (optional Q/K/V biases, RoPE on Q and K)
-// -> residual add -> RMS norm -> SiLU-gated parallel FFN -> residual add -> control vector.
-// After the last layer the final RMS norm is stored in res->t_embd and the lm_head
-// projection in res->t_logits.
-struct llm_build_ernie4_5 : public llm_graph_context {
-    llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // inp_pos - contains the positions
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        auto * inp_attn = build_attn_inp_kv();
-
-        // created once, ahead of the layer loop, and null-checked before it is used as
-        // row indices (the same pattern the MoE variant of this builder follows)
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * inpSA = inpL;
-
-            // norm
-            {
-                cur = build_norm(inpL,
-                        model.layers[il].attn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "attn_norm", il);
-            }
-
-            // self-attention
-            {
-                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                // split the flat projections into heads
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-                Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                        );
-
-                Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow
-                        );
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                cur = build_attn(inp_attn,
-                        model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-            }
-
-            if (il == n_layer - 1 && inp_out_ids) {
-                // skip computing output for unused tokens
-                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = build_ffn(cur,
-                        model.layers[il].ffn_up,   NULL, NULL,
-                        model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-
-            cur = build_cvec(cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = build_norm(cur,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-// Graph builder for ERNIE 4.5 MoE.
-//
-// The attention half of every layer matches the dense ERNIE 4.5 builder. The FFN half
-// is a routed mixture of experts on layers that are at or past hparams.n_layer_dense_lead
-// and whose 1-based index is a multiple of hparams.n_moe_layer_step; all other layers use
-// a dense SiLU-gated FFN. When hparams.n_ff_shexp > 0 a shared expert is added to the
-// routed output.
-struct llm_build_ernie4_5_moe : public llm_graph_context {
-    llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * hidden = build_inp_embd(model.tok_embd);
-
-        // token positions consumed by RoPE
-        ggml_tensor * positions = build_inp_pos();
-
-        auto * attn_inp = build_attn_inp_kv();
-
-        ggml_tensor * out_ids = build_inp_out_ids();
-
-        GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
-
-        const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
-
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers[il];
-
-            ggml_tensor * residual = hidden;
-
-            // pre-attention norm
-            ggml_tensor * x = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "attn_norm", il);
-
-            // self-attention
-            {
-                // linear projection of the normed activations, plus the bias when the layer has one
-                const auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * name) {
-                    ggml_tensor * t = build_lora_mm(weight, x);
-                    cb(t, name, il);
-                    if (bias != nullptr) {
-                        t = ggml_add(ctx0, t, bias);
-                        cb(t, name, il);
-                    }
-                    return t;
-                };
-
-                ggml_tensor * q = project(layer.wq, layer.bq, "Qcur");
-                ggml_tensor * k = project(layer.wk, layer.bk, "Kcur");
-                ggml_tensor * v = project(layer.wv, layer.bv, "Vcur");
-
-                q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
-                k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
-                v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
-                // rotary embedding on Q and K only
-                q = ggml_rope_ext(ctx0, q, positions, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-                k = ggml_rope_ext(ctx0, k, positions, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                cb(q, "Qcur", il);
-                cb(k, "Kcur", il);
-                cb(v, "Vcur", il);
-
-                x = build_attn(attn_inp, layer.wo, NULL,
-                        q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
-                cb(x, "attn_out", il);
-            }
-
-            // on the last layer keep only the rows that are actually requested as output
-            const bool last_layer = il == n_layer - 1;
-            if (last_layer && out_ids) {
-                x        = ggml_get_rows(ctx0, x,        out_ids);
-                residual = ggml_get_rows(ctx0, residual, out_ids);
-            }
-
-            ggml_tensor * ffn_inp = ggml_add(ctx0, x, residual);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            const bool past_dense_lead = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead;
-            const bool is_moe_layer    = past_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
-
-            // both the dense and the MoE variant start from the same normed input
-            x = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "ffn_norm", il);
-
-            if (is_moe_layer) {
-                // routed experts
-                ggml_tensor * routed = build_moe_ffn(x,
-                        layer.ffn_gate_inp,
-                        layer.ffn_up_exps,
-                        layer.ffn_gate_exps,
-                        layer.ffn_down_exps,
-                        layer.ffn_exp_probs_b,
-                        n_expert, n_expert_used,
-                        LLM_FFN_SILU, true,
-                        false, 0.0,
-                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                        il);
-                cb(routed, "ffn_moe_out", il);
-
-                if (hparams.n_ff_shexp > 0) {
-                    // shared expert runs on the same input and is summed with the routed output
-                    ggml_tensor * shared = build_ffn(x,
-                            layer.ffn_up_shexp,   NULL, NULL,
-                            layer.ffn_gate_shexp, NULL, NULL,
-                            layer.ffn_down_shexp, NULL, NULL,
-                            NULL,
-                            LLM_FFN_SILU, LLM_FFN_PAR, il);
-                    cb(shared, "ffn_shexp", il);
-
-                    x = ggml_add(ctx0, routed, shared);
-                } else {
-                    x = routed;
-                }
-            } else {
-                // dense SiLU-gated FFN
-                x = build_ffn(x,
-                        layer.ffn_up,   NULL, NULL,
-                        layer.ffn_gate, NULL, NULL,
-                        layer.ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-            }
-            cb(x, "ffn_out", il);
-
-            x = ggml_add(ctx0, x, ffn_inp);
-            cb(x, "ffn_out", il);
-
-            x = build_cvec(x, il);
-            cb(x, "l_out", il);
-
-            // feeds the next layer
-            hidden = x;
-        }
-
-        ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
-        cb(out, "result_norm", -1);
-        res->t_embd = out;
-
-        // lm_head
-        out = build_lora_mm(model.output, out);
-        cb(out, "result_output", -1);
-        res->t_logits = out;
-
-        ggml_build_forward_expand(gf, out);
-    }
-};
-
-// Graph builder for Falcon-H1, a hybrid architecture in which every layer runs a
-// self-attention branch and a Mamba2 (SSM) branch on the same RMS-normed input and sums
-// their outputs before the residual add. A SiLU-gated parallel FFN with optional biases
-// follows. The final RMS norm is stored in res->t_embd and the lm_head projection in
-// res->t_logits.
-struct llm_build_falcon_h1 : public llm_graph_context_mamba {
-    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // inp_pos - contains the positions
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        // Build the inputs in the recurrent & kv cache
-        auto * inp = build_inp_mem_hybrid();
-
-        // a zero f_attention_scale selects the default 1/sqrt(head_dim) scaling
-        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * inpSA = inpL;
-
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            cb(Qcur, "Qcur-post-rope", il);
-            cb(Kcur, "Kcur-post-rope", il);
-            cb(Vcur, "Vcur-post-rope", il);
-
-            ggml_tensor * attn_out = build_attn(inp->get_attn(),
-                    model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
-            cb(attn_out, "attn_out", il);
-
-            // the SSM branch gets its own normed copy of the layer input (same norm weights
-            // as the attention branch)
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            // Mamba2 layer
-            cb(cur, "ssm_in", il);
-
-            ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
-            cb(ssm_out, "ssm_out", il);
-
-            // Aggregation
-            cur   = ggml_add(ctx0, attn_out, ssm_out);
-            inpSA = ggml_add(ctx0, cur, inpSA);
-            cb(cur, "layer_out", il);
-
-            if (il == n_layer - 1 && inp_out_ids) {
-                // skip computing output for unused tokens; only the residual stream has to be
-                // reduced here because cur is recomputed from ffn_inp right below
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            ggml_tensor * ffn_inp = inpSA;
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            cur = build_norm(ffn_inp,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, inpSA);
-
-            cur = build_cvec(cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = build_norm(cur,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-// Graph builder for PLaMo-2, a hybrid model in which each layer's mixer is either a
-// Mamba block or an attention block, chosen per layer by hparams.is_recurrent(il).
-// Both the mixer and the FFN are wrapped in a pre-norm and a post-norm (all RMS) before
-// their residual adds. The final RMS norm is stored in res->t_embd and the lm_head
-// projection in res->t_logits.
-struct llm_build_plamo2 : public llm_graph_context_mamba {
-    llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        // {n_embd, n_tokens}
-        inpL = build_inp_embd(model.tok_embd);
-        cb(inpL, "embedding_output", -1);
-
-        ggml_tensor * inp_pos = build_inp_pos();
-
-        auto * inp_hybrid = build_inp_mem_hybrid();
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * residual = inpL;
-
-            // pre_mixer_norm
-            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-
-            // check if this layer is Mamba or Attention
-            bool is_mamba_layer = hparams.is_recurrent(il);
-
-            if (is_mamba_layer) {
-                // PLaMo-2 Mamba layer
-                cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
-            } else {
-                // PLaMo-2 Attention layer
-                cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
-            }
-
-            // post_mixer_norm
-            cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "attn_post_norm", il);
-
-            // residual connection
-            cur = ggml_add(ctx0, cur, residual);
-            cb(cur, "attn_residual", il);
-            residual = cur;
-
-            // pre-ffn norm
-            cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "ffn_pre_norm", il);
-
-            // feed-forward network
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up,   NULL, NULL,
-                    NULL,                      NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
-            cb(cur, "ffn_out", il);
-
-            // post ffn norm
-            cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "ffn_post_norm", il);
-
-            // on the last layer keep only the rows that are requested as output
-            if (il == n_layer - 1 && inp_out_ids) {
-                cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
-                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-            }
-
-            // residual connection
-            cur = ggml_add(ctx0, cur, residual);
-            cb(cur, "ffn_residual", il);
-
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        // final norm
-        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-        cb(cur, "result_norm", -1);
-
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-        cb(cur, "result_output", -1);
-
-        // Explicitly mark as output tensor to ensure proper backend assignment
-        ggml_set_output(cur);
-
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-
-private:
-    // Attention mixer: fused QKV projection, per-head RMS norm on Q and K, RoPE on Q and K,
-    // then attention with the output projection wo. `cur` is the pre-normed layer input;
-    // the returned tensor is the attention output.
-    ggml_tensor * build_plamo2_attn_layer(
-            llm_graph_input_attn_kv * inp,
-            ggml_tensor * inp_pos,
-            ggml_tensor * cur,
-            const llama_model & model,
-            int il) {
-
-        // self-attention
-        {
-            // PLaMo-2 uses combined QKV tensor
-            ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "wqkv", il);
-
-            // split QKV tensor into Q, K, V
-            const int64_t n_embd_head_q = hparams.n_embd_head_k;
-            const int64_t n_embd_head_k = hparams.n_embd_head_k;
-            const int64_t n_embd_head_v = hparams.n_embd_head_v;
-            int32_t n_head    = hparams.n_head(il);
-            int32_t n_head_kv = hparams.n_head_kv(il);
-
-            // element offsets of the Q, K and V slices inside one fused row
-            const int64_t q_offset = 0;
-            const int64_t k_offset = n_embd_head_q * n_head;
-            const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
-
-            // strides and offsets are in bytes; derive both from the element size of qkv
-            // itself so they cannot disagree with each other
-            const size_t qkv_es = ggml_element_size(qkv);
-
-            ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,    n_tokens,
-                    n_embd_head_q * qkv_es, qkv->nb[1], q_offset * qkv_es);
-            ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens,
-                    n_embd_head_k * qkv_es, qkv->nb[1], k_offset * qkv_es);
-            ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens,
-                    n_embd_head_v * qkv_es, qkv->nb[1], v_offset * qkv_es);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            cur = build_attn(inp,
-                    model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
-        }
-
-        cb(cur, "attn_out", il);
-
-        return cur;
-    }
-
-    // Mamba mixer: in-projection into z and x, causal conv1d over x with a cached rolling
-    // state, RMS-normed B/C/dt, selective scan against the cached SSM state, D skip term,
-    // SwiGLU gating with z and the out-projection. Both recurrent states are written back
-    // into the cache as graph side effects. Returns {n_embd, n_tokens}.
-    ggml_tensor * build_plamo2_mamba_layer(
-            llm_graph_input_rs * inp,
-            ggml_tensor * cur,
-            const llama_model & model,
-            const llama_ubatch & ubatch,
-            int il) {
-
-        const auto * mctx_cur = inp->mctx;
-
-        const auto kv_head = mctx_cur->get_head();
-
-        const int64_t d_conv   = hparams.ssm_d_conv;
-        const int64_t d_inner  = hparams.ssm_d_inner;
-        const int64_t d_state  = hparams.ssm_d_state;
-        const int64_t n_heads  = hparams.ssm_dt_rank;
-        const int64_t head_dim = d_inner / n_heads;
-        const int64_t n_group  = hparams.ssm_n_group;
-        const int64_t n_seqs   = ubatch.n_seqs;
-
-        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-
-        GGML_ASSERT(n_seqs != 0);
-        GGML_ASSERT(ubatch.equal_seqs());
-        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
-        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
-        ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
-
-        ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
-        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
-
-        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
-        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
-        // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
-        ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
-        cb(zx, "mamba_in_proj", il);
-        // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
-        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
-        zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
-        cb(zx, "mamba_in_proj_out", il);
-
-        // split into z and x
-        // => {head_dim * n_heads, n_seq_tokens, n_seqs}
-        // within each head, z is the first head_dim elements and x the second head_dim
-        ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs,
-                zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
-        x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
-        cb(x, "mamba_x_split", il);
-
-        ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs,
-                zx->nb[1], zx->nb[2], zx->nb[3], 0);
-        cb(z, "mamba_z_split", il);
-
-        // conv1d
-        {
-            // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
-            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
-            cb(conv_x, "mamba_conv1d_input", il);
-
-            // copy last (d_conv - 1) columns back into the state cache
-            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
-                    conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
-
-            ggml_build_forward_expand(gf,
-                    ggml_cpy(ctx0, last_conv,
-                        ggml_view_1d(ctx0, conv_states_all,
-                            (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
-                            kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
-            cb(conv_states_all, "mamba_conv1d_state", il);
-
-            // 1D convolution
-            x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
-            cb(x, "mamba_conv1d", il);
-
-            x = ggml_silu(ctx0, x);
-            cb(x, "mamba_conv1d_silu", il);
-        }
-
-        // SSM
-        {
-            // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
-            ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
-            cb(x_bcdt, "mamba_bcdt_proj", il);
-
-            // split into dt, B, C (row layout of x_bcdt: B | C | dt)
-            const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
-            ggml_tensor * B  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
-            ggml_tensor * C  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
-            ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim,  n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
-            cb(B,  "mamba_B_raw",  il);
-            cb(C,  "mamba_C_raw",  il);
-            cb(dt, "mamba_dt_raw", il);
-
-            // Apply RMS norm to dt, B, C (PLaMo-2 specific)
-            B  = build_norm(B,  model.layers[il].ssm_b_norm,  NULL, LLM_NORM_RMS, il);
-            C  = build_norm(C,  model.layers[il].ssm_c_norm,  NULL, LLM_NORM_RMS, il);
-            dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
-            cb(B,  "mamba_B_normed",  il);
-            cb(C,  "mamba_C_normed",  il);
-            cb(dt, "mamba_dt_normed", il);
-
-            // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
-            dt = build_lora_mm(model.layers[il].ssm_dt, dt);
-            dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
-            cb(dt, "mamba_dt_proj", il);
-
-            ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
-            cb(A, "mamba_A", il);
-
-            // expose the head dimension of x and add a singleton group dimension to B and C
-            x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs,
-                    head_dim * ggml_element_size(x),
-                    head_dim * n_heads * ggml_element_size(x),
-                    head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
-            B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
-            C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
-
-            // use the states and the indices provided by build_recurrent_state
-            // (this is necessary in order to properly use the states before they are overwritten,
-            //  while avoiding to make unnecessary copies of the states)
-            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
-                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
-
-                // Custom operator to optimize the parallel associative scan
-                // as described in the Annex D of the Mamba paper.
-                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
-                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
-            };
-
-            ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
-            cb(y_ssm, "mamba_ssm_scan", il);
-
-            // store last states
-            // y_ssm holds the scan output followed by the updated states. The destination
-            // starts kv_head cells into the cache and one cell holds n_heads*head_dim*d_state
-            // elements, i.e. the same per-cell addressing as the conv-state write above
-            // (no n_seqs factor in the offset).
-            const int64_t n_state_elems = n_heads*head_dim*d_state;
-            ggml_build_forward_expand(gf,
-                    ggml_cpy(ctx0,
-                        ggml_view_1d(ctx0, y_ssm, n_state_elems*n_seqs,
-                            n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
-                        ggml_view_1d(ctx0, ssm_states_all, n_state_elems*n_seqs,
-                            kv_head*n_state_elems*ggml_element_size(ssm_states_all))));
-            cb(ssm_states_all, "mamba_ssm_states", il);
-
-            // byte strides of the view are taken from y_ssm, the tensor being viewed
-            const size_t y_es = ggml_element_size(y_ssm);
-            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs,
-                    head_dim * y_es,
-                    head_dim * n_heads * y_es,
-                    head_dim * n_heads * n_seq_tokens * y_es, 0);
-            cb(y, "mamba_y_view", il);
-
-            // Add D parameter and apply gating with z
-            // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
-            ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
-            y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
-            cb(y, "mamba_y_add_d", il);
-
-            y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
-            cb(y, "mamba_y_swiglu_z", il);
-
-            // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
-            y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
-            cur = build_lora_mm(model.layers[il].ssm_out, y);
-            cb(cur, "mamba_out_proj", il);
-        }
-
-        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
-        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
-        cb(cur, "mamba_out", il);
-
-        return cur;
-    }
-};
-
-// Graph builder for the Arcee architecture.
-//
-// Per layer: RMS norm -> self-attention (optional Q/K/V biases, RoPE with optional
-// frequency factors, output projection with optional bias) -> residual add -> RMS norm
-// -> un-gated FFN with squared-ReLU activation -> residual add -> control vector.
-struct llm_build_arcee : public llm_graph_context {
-    llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * hidden = build_inp_embd(model.tok_embd);
-
-        // token positions consumed by RoPE
-        ggml_tensor * positions = build_inp_pos();
-
-        auto * attn_inp = build_attn_inp_kv();
-
-        // an explicit attention scale in the hparams overrides the default 1/sqrt(head_dim)
-        float kq_scale = hparams.f_attention_scale;
-        if (hparams.f_attention_scale == 0.0f) {
-            kq_scale = 1.0f/sqrtf(float(n_embd_head));
-        }
-
-        ggml_tensor * out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers[il];
-
-            ggml_tensor * residual = hidden;
-
-            // pre-attention norm
-            ggml_tensor * x = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "attn_norm", il);
-
-            // self-attention
-            {
-                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * freq_factors = model.get_rope_factors(cparams, il);
-
-                // linear projection of the normed activations, plus the bias when the layer has one
-                const auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * name) {
-                    ggml_tensor * t = build_lora_mm(weight, x);
-                    cb(t, name, il);
-                    if (bias != nullptr) {
-                        t = ggml_add(ctx0, t, bias);
-                        cb(t, name, il);
-                    }
-                    return t;
-                };
-
-                ggml_tensor * q = project(layer.wq, layer.bq, "Qcur");
-                ggml_tensor * k = project(layer.wk, layer.bk, "Kcur");
-                ggml_tensor * v = project(layer.wv, layer.bv, "Vcur");
-
-                q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
-                k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
-                v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
-                // rotary embedding on Q and K only
-                q = ggml_rope_ext(ctx0, q, positions, freq_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-                k = ggml_rope_ext(ctx0, k, positions, freq_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                cb(q, "Qcur", il);
-                cb(k, "Kcur", il);
-                cb(v, "Vcur", il);
-
-                x = build_attn(attn_inp, layer.wo, layer.bo,
-                        q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
-                cb(x, "attn_out", il);
-            }
-
-            // on the last layer keep only the rows that are actually requested as output
-            const bool last_layer = il == n_layer - 1;
-            if (last_layer && out_ids) {
-                x        = ggml_get_rows(ctx0, x,        out_ids);
-                residual = ggml_get_rows(ctx0, residual, out_ids);
-            }
-
-            ggml_tensor * ffn_inp = ggml_add(ctx0, x, residual);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            // ARCEE uses relu^2 instead of silu
-            x = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "ffn_norm", il);
-
-            // no gate tensor: up -> relu^2 -> down
-            x = build_ffn(x,
-                    layer.ffn_up,   NULL, NULL,
-                    NULL,           NULL, NULL,
-                    layer.ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
-            cb(x, "ffn_out", il);
-
-            x = ggml_add(ctx0, x, ffn_inp);
-            cb(x, "ffn_out", il);
-
-            x = build_cvec(x, il);
-            cb(x, "l_out", il);
-
-            // feeds the next layer
-            hidden = x;
-        }
-
-        ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
-        cb(out, "result_norm", -1);
-        res->t_embd = out;
-
-        // lm_head
-        out = build_lora_mm(model.output, out);
-        cb(out, "result_output", -1);
-        res->t_logits = out;
-
-        ggml_build_forward_expand(gf, out);
-    }
-};
-
-// Graph builder for Hunyuan MoE.
-//
-// Per layer: RMS norm -> self-attention where Q and K are RoPE'd and then RMS-normed
-// (attn_q_norm / attn_k_norm) -> residual add -> RMS norm -> sum of a shared-expert
-// SiLU-gated FFN and a softmax-routed mixture of experts -> residual add -> control vector.
-struct llm_build_hunyuan_moe : public llm_graph_context {
-    llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * hidden = build_inp_embd(model.tok_embd);
-
-        // token positions consumed by RoPE
-        ggml_tensor * positions = build_inp_pos();
-
-        auto * attn_inp = build_attn_inp_kv();
-
-        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
-
-        ggml_tensor * out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers[il];
-
-            ggml_tensor * residual = hidden;
-
-            // pre-attention norm
-            ggml_tensor * x = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "attn_norm", il);
-
-            // self-attention
-            {
-                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * freq_factors = model.get_rope_factors(cparams, il);
-
-                // linear projection of the normed activations, plus the bias when the layer has one
-                const auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * name) {
-                    ggml_tensor * t = build_lora_mm(weight, x);
-                    cb(t, name, il);
-                    if (bias != nullptr) {
-                        t = ggml_add(ctx0, t, bias);
-                        cb(t, name, il);
-                    }
-                    return t;
-                };
-
-                ggml_tensor * q = project(layer.wq, layer.bq, "Qcur");
-                ggml_tensor * k = project(layer.wk, layer.bk, "Kcur");
-                ggml_tensor * v = project(layer.wv, layer.bv, "Vcur");
-
-                q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
-                k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
-                v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
-                q = ggml_rope_ext(ctx0, q, positions, freq_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                // labels are attached here: Q after RoPE, K and V right after the reshape
-                cb(q, "Qcur", il);
-                cb(k, "Kcur", il);
-                cb(v, "Vcur", il);
-
-                k = ggml_rope_ext(ctx0, k, positions, freq_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                // Q/K norms are applied after RoPE, K first
-                k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
-                cb(k, "Kcur_norm", il);
-
-                q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
-                cb(q, "Qcur_norm", il);
-
-                x = build_attn(attn_inp, layer.wo, layer.bo,
-                        q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
-                cb(x, "attn_out", il);
-            }
-
-            // on the last layer keep only the rows that are actually requested as output
-            const bool last_layer = il == n_layer - 1;
-            if (last_layer && out_ids) {
-                x        = ggml_get_rows(ctx0, x,        out_ids);
-                residual = ggml_get_rows(ctx0, residual, out_ids);
-            }
-
-            ggml_tensor * ffn_inp = ggml_add(ctx0, x, residual);
-            cb(ffn_inp, "ffn_inp", il);
-
-            x = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "ffn_norm", il);
-
-            // feed-forward network (non-MoE): the shared expert
-            ggml_tensor * shared = build_ffn(x,
-                    layer.ffn_up_shexp,   NULL, NULL,
-                    layer.ffn_gate_shexp, NULL, NULL,
-                    layer.ffn_down_shexp, NULL, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(shared, "ffn_mlp", il);
-
-            // MoE branch
-            ggml_tensor * routed = build_moe_ffn(x,
-                    layer.ffn_gate_inp,
-                    layer.ffn_up_exps,
-                    layer.ffn_gate_exps,
-                    layer.ffn_down_exps,
-                    nullptr,
-                    n_expert, n_expert_used,
-                    LLM_FFN_SILU,
-                    true, // norm_topk_prob
-                    false,
-                    0.0,
-                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                    il);
-            cb(routed, "ffn_moe_out", il);
-
-            ggml_tensor * ffn_out = ggml_add(ctx0, routed, shared);
-            cb(ffn_out, "ffn_out", il);
-
-            x = ggml_add(ctx0, ffn_out, ffn_inp);
-
-            x = build_cvec(x, il);
-            cb(x, "l_out", il);
-
-            // feeds the next layer
-            hidden = x;
-        }
-
-        ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
-        cb(out, "result_norm", -1);
-        res->t_embd = out;
-
-        // lm_head
-        out = build_lora_mm(model.output, out);
-        cb(out, "result_output", -1);
-        res->t_logits = out;
-
-        ggml_build_forward_expand(gf, out);
-    }
-};
-
-// Graph builder for the dense Hunyuan architecture.
-//
-// Per layer: RMS norm -> self-attention where Q and K are RoPE'd and then RMS-normed
-// (attn_q_norm / attn_k_norm) -> residual add -> RMS norm -> SiLU-gated parallel FFN
-// -> residual add -> control vector.
-struct llm_build_hunyuan_dense : public llm_graph_context {
-    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * hidden = build_inp_embd(model.tok_embd);
-
-        // token positions consumed by RoPE
-        ggml_tensor * positions = build_inp_pos();
-
-        auto * attn_inp = build_attn_inp_kv();
-
-        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
-
-        ggml_tensor * out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers[il];
-
-            ggml_tensor * residual = hidden;
-
-            // pre-attention norm
-            ggml_tensor * x = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "attn_norm", il);
-
-            // self-attention
-            {
-                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * freq_factors = model.get_rope_factors(cparams, il);
-
-                // linear projection of the normed activations, plus the bias when the layer has one
-                const auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * name) {
-                    ggml_tensor * t = build_lora_mm(weight, x);
-                    cb(t, name, il);
-                    if (bias != nullptr) {
-                        t = ggml_add(ctx0, t, bias);
-                        cb(t, name, il);
-                    }
-                    return t;
-                };
-
-                ggml_tensor * q = project(layer.wq, layer.bq, "Qcur");
-                ggml_tensor * k = project(layer.wk, layer.bk, "Kcur");
-                ggml_tensor * v = project(layer.wv, layer.bv, "Vcur");
-
-                q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
-                k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
-                v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
-                q = ggml_rope_ext(ctx0, q, positions, freq_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                // labels are attached here: Q after RoPE, K and V right after the reshape
-                cb(q, "Qcur", il);
-                cb(k, "Kcur", il);
-                cb(v, "Vcur", il);
-
-                k = ggml_rope_ext(ctx0, k, positions, freq_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-
-                // Q/K norms are applied after RoPE, K first
-                k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
-                cb(k, "Kcur_norm", il);
-
-                q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
-                cb(q, "Qcur_norm", il);
-
-                x = build_attn(attn_inp, layer.wo, layer.bo,
-                        q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
-                cb(x, "attn_out", il);
-            }
-
-            // on the last layer keep only the rows that are actually requested as output
-            const bool last_layer = il == n_layer - 1;
-            if (last_layer && out_ids) {
-                x        = ggml_get_rows(ctx0, x,        out_ids);
-                residual = ggml_get_rows(ctx0, residual, out_ids);
-            }
-
-            ggml_tensor * ffn_inp = ggml_add(ctx0, x, residual);
-            cb(ffn_inp, "ffn_inp", il);
-
-            x = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
-            cb(x, "ffn_norm", il);
-
-            // feed-forward network (non-MoE)
-            ggml_tensor * mlp_out = build_ffn(x,
-                    layer.ffn_up,   NULL, NULL,
-                    layer.ffn_gate, NULL, NULL,
-                    layer.ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(mlp_out, "ffn_out", il);
-
-            x = ggml_add(ctx0, mlp_out, ffn_inp);
-
-            x = build_cvec(x, il);
-            cb(x, "l_out", il);
-
-            // feeds the next layer
-            hidden = x;
-        }
-
-        ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
-        cb(out, "result_norm", -1);
-        res->t_embd = out;
-
-        // lm_head
-        out = build_lora_mm(model.output, out);
-        cb(out, "result_output", -1);
-        res->t_logits = out;
-
-        ggml_build_forward_expand(gf, out);
-    }
-};
-// Graph builder for SmolLM3: norm -> attention -> residual -> norm -> gated FFN -> residual, with RoPE skipped on every n_no_rope_layer_step-th layer.
-struct llm_build_smollm3 : public llm_graph_context {
- llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // the K head size and the RoPE dimension count must both equal the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
- // use hparams.f_attention_scale when it is non-zero, otherwise 1/sqrt(head size)
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- // row indices used on the last layer to select which rows of cur / inpSA are kept
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // RoPE is skipped when (il + 1) is a multiple of n_no_rope_layer_step
- const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
-
- // pre-attention norm (LLM_NORM_RMS, weight attn_norm, no bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // project Q, K and V; each bias is added only when the layer has one
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- // split into heads: {n_embd_head, n_head (Q) or n_head_kv (K, V), n_tokens}
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // rotate Q and K only on layers that use RoPE; no frequency-factor tensor is passed
- if (use_rope) {
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
- // last layer only: keep just the rows listed in inp_out_ids, for both the attention output and the residual
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network: norm, then LLM_FFN_SILU / LLM_FFN_PAR FFN with the layer's up/gate/down bias tensors passed through
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // second residual connection; the sum is tagged with the same "ffn_out" name as the FFN output above
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for the OpenAI MoE model with the iswa attention input: per layer, norm -> attention (with attention sinks) -> residual -> norm -> MoE FFN -> residual.
-struct llm_build_openai_moe_iswa : public llm_graph_context {
- llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv_iswa();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // pre-attention norm (LLM_NORM_RMS, weight attn_norm, no bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // project Q, K and V; each bias is added only when the layer has one
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- // this builder uses n_rot as the per-head size, both for the reshape and for the attention scale
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- // the layer's attn_sinks tensor is handed to build_attn; scale is 1/sqrt(n_rot)
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
-
- cb(cur, "attn_out", il);
- }
-
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- // only gather rows when an index tensor was actually produced, matching the
- // `&& inp_out_ids` guard used by the other builders in this file
- if (inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- }
-
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // the MoE input is normed with the layer's attn_post_norm weight
- cur = ffn_inp;
- cur = build_norm(cur,
- model.layers[il].attn_post_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
-
- // MoE branch: gate and expert tensors are passed together with their bias tensors
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
- model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
- model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SWIGLU_OAI_MOE, false,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
- il);
- cb(cur, "ffn_moe_out", il);
-
- // second residual connection: MoE output + MoE input
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
-
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for LFM2: each layer runs either a short-convolution block (when hparams.is_recurrent(il))
-// or an attention block, followed by a dense or MoE feed-forward, each with its own residual connection.
-struct llm_build_lfm2 : public llm_graph_context {
- // kept as a member because the helper methods below read the layer weights
- const llama_model & model;
-
- llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
-
- ggml_tensor * cur = build_inp_embd(model.tok_embd);
- cb(cur, "model.embed_tokens", -1);
-
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_hybrid = build_inp_mem_hybrid();
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- // layers with index >= n_layer_dense_lead use the MoE feed-forward, earlier ones the dense one
- const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
-
- auto * prev_cur = cur;
- cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "model.layers.{}.operator_norm", il);
-
- // recurrent layers get the short-conv operator, all others get attention
- cur = hparams.is_recurrent(il) ?
- build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
- build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ;
-
- // last layer only: keep just the rows listed in inp_out_ids, for both the operator output and the residual
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
- }
-
- // first residual connection
- cur = ggml_add(ctx0, prev_cur, cur);
-
- auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
-
- ggml_tensor * ffn_out = is_moe_layer ?
- build_moe_feed_forward(ffn_norm_out, il) :
- build_dense_feed_forward(ffn_norm_out, il);
- // tag the feed-forward result itself; tagging ffn_norm_out here would re-tag the norm
- // tensor (already tagged as ffn_norm above) and leave ffn_out untagged
- cb(ffn_out, "model.layers.{}.ffn_out", il);
-
- // second residual connection
- cur = ggml_add(ctx0, cur, ffn_out);
- }
-
- // final norm uses model.tok_norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
- cb(cur, "model.embedding_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
- cb(cur, "lm_head", -1);
-
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-
- // MoE feed-forward for layer il: LLM_FFN_SILU experts, gating function taken from hparams.expert_gating_func.
- ggml_tensor * build_moe_feed_forward(ggml_tensor * cur,
- int il) const {
- return build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
- il);
- }
-
- // Dense feed-forward for layer il (LLM_FFN_SILU, LLM_FFN_PAR); asserts that the layer has no FFN bias tensors.
- ggml_tensor * build_dense_feed_forward(ggml_tensor * cur,
- int il) const {
- GGML_ASSERT(!model.layers[il].ffn_up_b);
- GGML_ASSERT(!model.layers[il].ffn_gate_b);
- GGML_ASSERT(!model.layers[il].ffn_down_b);
- return build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- }
-
- // Attention block for layer il: Q/K/V projections, Q/K norm, RoPE on Q and K, then build_attn with scale 1/sqrt(n_embd_head).
- ggml_tensor * build_attn_block(ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv * inp_attn,
- int il) const {
- GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
- auto const n_embd_head = hparams.n_embd_head_v;
- // the KV head count is looked up per layer
- auto const n_head_kv = hparams.n_head_kv(il);
-
- auto * q = build_lora_mm(model.layers[il].wq, cur);
- cb(q, "model.layers.{}.self_attn.q_proj", il);
- auto * k = build_lora_mm(model.layers[il].wk, cur);
- cb(k, "model.layers.{}.self_attn.k_proj", il);
- auto * v = build_lora_mm(model.layers[il].wv, cur);
- cb(v, "model.layers.{}.self_attn.v_proj", il);
-
- q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
- k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
- v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
- // qk norm
- q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(q, "model.layers.{}.self_attn.q_layernorm", il);
- k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(k, "model.layers.{}.self_attn.k_layernorm", il);
-
- // RoPE
- q = ggml_rope_ext(
- ctx0, q, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- k = ggml_rope_ext(
- ctx0, k, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cur = build_attn(inp_attn, model.layers[il].wo, NULL,
- q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-
- cb(cur, "model.layers.{}.self_attn.out_proj", il);
-
- return cur;
- }
-
- // Short-convolution block for layer il: in_proj yields three equal chunks (b, c, x); b*x is prepended
- // with the stored conv state, convolved with the layer kernel, multiplied by c and sent through out_proj.
- // The trailing d_conv columns of the concatenated input are written back as the new conv state.
- ggml_tensor * build_shortconv_block(ggml_tensor * cur,
- llm_graph_input_rs * inp_recr,
- int il) {
- const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
- const uint32_t kv_head = mctx_cur->get_head();
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- const int64_t n_seqs = ubatch.n_seqs;
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs());
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
- GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
- // the stored state is one column shorter than n_shortconv_l_cache
- const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
-
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
- auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
- cb(bcx, "model.layers.{}.conv.in_proj", il);
-
- // split the projection along dim 0 into three equally sized views: b, c and x
- constexpr auto n_chunks = 3;
- GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
- auto const chunk_size = bcx->ne[0] / n_chunks;
- auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
- auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
- auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
-
- auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
-
- // read conv state
- auto * conv_state = mctx_cur->get_r_l(il);
- auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
- auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
-
- // prepend the stored state to the new columns along dim 0
- bx = ggml_concat(ctx0, conv, bx, 0);
- GGML_ASSERT(bx->ne[0] > conv->ne[0]);
-
- // last d_conv columns is a new conv state
- auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
- GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
-
- // write the new conv state back into conv_state at the kv_head offset
- ggml_build_forward_expand(
- gf,
- ggml_cpy(
- ctx0,
- new_conv,
- ggml_view_1d(
- ctx0,
- conv_state,
- ggml_nelements(new_conv),
- kv_head*d_conv*n_embd*ggml_element_size(new_conv)
- )
- )
- );
-
- auto * conv_kernel = model.layers[il].shortconv.conv;
- auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
- cb(conv_out, "model.layers.{}.conv.conv", il);
-
- auto * y = ggml_mul(ctx0, c, conv_out);
- y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
- cb(y, "model.layers.{}.conv.out_proj", il);
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
-
- return y;
- }
-};
-// Graph builder for Seed-OSS: norm -> attention -> residual -> norm (attn_post_norm) -> gated FFN -> residual.
-struct llm_build_seed_oss : public llm_graph_context {
- llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // the K head size and the RoPE dimension count must both equal the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
- // use hparams.f_attention_scale when it is non-zero, otherwise 1/sqrt(head size)
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- // row indices used on the last layer to select which rows of cur / inpSA are kept
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // pre-attention norm (LLM_NORM_RMS, weight attn_norm, no bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- // project Q, K and V; each bias is added only when the layer has one
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- // split into heads: {n_embd_head, n_head (Q) or n_head_kv (K, V), n_tokens}
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // rotate Q and K on every layer; no frequency-factor tensor is passed
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
- // last layer only: keep just the rows listed in inp_out_ids, for both the attention output and the residual
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network: the FFN input is normed with the layer's attn_post_norm weight
- cur = build_norm(ffn_inp,
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
- // LLM_FFN_SILU with LLM_FFN_PAR, no bias or scale tensors passed
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- // second residual connection; the sum is tagged with the same "ffn_out" name as the FFN output above
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for SmallThinker (MoE). The iswa flag picks the attention input: build_attn_inp_kv_iswa() when true, build_attn_inp_kv() otherwise.
-template <bool iswa>
-struct llm_build_smallthinker : public llm_graph_context{
- llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // the K head size and the RoPE dimension count must both equal the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- // the attention input type is chosen at compile time from the iswa flag
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
- inp_attn_type * inp_attn = nullptr;
-
- if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_iswa();
- } else {
- inp_attn = build_attn_inp_kv();
- }
- // row indices used on the last layer to select which rows of cur / inpSA / probs are kept
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- ggml_tensor * probs = nullptr;
- // MoE gate logits are computed from the raw layer input, before the attention norm
- probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
- cb(probs, "ffn_moe_logits", il);
-
- // pre-attention norm (LLM_NORM_RMS, weight attn_norm, no bias)
- cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // project Q, K and V (no biases are added in this builder)
- struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // RoPE is skipped when il is a multiple of n_no_rope_layer_step, unless the step equals n_layer (then every layer is rotated)
- if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
- Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
-
- Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
- }
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- // attention scale is 1/sqrt(head size)
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
- }
- // last layer only: keep just the rows listed in inp_out_ids, including the matching rows of the gate logits
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- probs = ggml_get_rows(ctx0, probs, inp_out_ids);
- }
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch: no gate tensor is passed (nullptr); the logits computed above go in as the trailing probs argument
- cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- ggml_tensor * ffn_out =
- build_moe_ffn(cur,
- nullptr,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_RELU, true,
- false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
- il, probs);
-
- cb(ffn_out, "ffn_out", il);
- cur = ffn_out;
- // second residual connection: MoE output + MoE input
- cur = ggml_add(ctx0, cur, ffn_inp);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for GroveMoE: attention with Q/K norms, then two MoE passes (regular experts and "chunk" experts) sharing one set of gate logits.
-struct llm_build_grovemoe : public llm_graph_context {
- llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
- // n_chunk_expert (above) is the expert count handed to the second MoE call; the K head size and RoPE dims must equal the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
- // row indices used on the last layer to select which rows of cur / inpSA are kept
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
-
- // pre-attention norm (LLM_NORM_RMS, weight attn_norm, no bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self_attention
- {
- // project Q, K and V (no biases are added in this builder)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // Q and K are each normed (attn_q_norm / attn_k_norm) before RoPE is applied
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- // attention scale is 1/sqrt(head size)
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: keep just the rows listed in inp_out_ids, for both the attention output and the residual
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // gate logits are computed once from the normed input and passed to both MoE calls via the trailing probs argument
- ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
- cb(probs, "ffn_moe_logits", il);
- // first pass: the regular experts (ffn_*_exps), n_expert experts
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- nullptr,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il, probs);
- cb(moe_out, "ffn_moe_out", il);
- cur = moe_out;
- // NOTE(review): cur now holds the first MoE output, so the chunk-expert pass below takes that output (not the ffn_norm result) as its input - confirm this is intended
- // TODO: Only do the expert selection and weights once
- moe_out =
- build_moe_ffn(cur,
- nullptr,
- model.layers[il].ffn_up_chexps,
- model.layers[il].ffn_gate_chexps,
- model.layers[il].ffn_down_chexps,
- nullptr,
- n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il, probs);
- cb(moe_out, "ffn_adj_moe_out", il);
- // combine: first MoE output + chunk-expert output scaled by hparams.expert_group_scale
- cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
- cb(cur, "ffn_final_moe_out", il);
- // second residual connection: combined MoE output + MoE input
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for Apertus: attention with Q/K norms applied before RoPE, and an ungated feed-forward (up -> xIELU -> down).
-struct llm_build_apertus : public llm_graph_context {
- llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // the K head size and the RoPE dimension count must both equal the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv();
- // use hparams.f_attention_scale when it is non-zero, otherwise 1/sqrt(head size)
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- // row indices used on the last layer to select which rows of cur / inpSA are kept
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // pre-attention norm (LLM_NORM_RMS, weight attn_norm, no bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
- // project Q, K and V (no biases are added in this builder)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- // Q and K are split into heads and normed (attn_q_norm / attn_k_norm) before RoPE
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // rotate Q and K using the per-layer rope_factors tensor obtained above
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- // note: V is tagged "Vcur_pos" as well, although only Q and K go through RoPE
- cb(Qcur, "Qcur_pos", il);
- cb(Kcur, "Kcur_pos", il);
- cb(Vcur, "Vcur_pos", il);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
- // last layer only: keep just the rows listed in inp_out_ids, for both the attention output and the residual
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network with xIELU activation (no gate projection is used)
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- // Up projection
- ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
- cb(up, "ffn_up", il);
- // the four xIELU parameters are read per layer from hparams
- float alpha_n_val = hparams.xielu_alpha_n[il];
- float alpha_p_val = hparams.xielu_alpha_p[il];
- float beta_val = hparams.xielu_beta[il];
- float eps_val = hparams.xielu_eps[il];
-
- // Apply xIELU activation
- ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
- cb(activated, "ffn_xielu", il);
-
- // Down projection
- cur = build_lora_mm(model.layers[il].ffn_down, activated);
- cb(cur, "ffn_down", il);
- }
- // second residual connection: FFN output + FFN input
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur,
- model.output_norm, nullptr,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-// Graph builder for MiniMax-M2: attention with Q/K norms applied before the head split, followed by a MoE feed-forward.
-struct llm_build_minimax_m2 : public llm_graph_context {
- llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- // only the K/V head sizes are required to match; the n_rot check is deliberately disabled (see below)
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
-
- ggml_tensor * cur;
- ggml_tensor * inpL;
-
- inpL = build_inp_embd(model.tok_embd);
-
- ggml_tensor * inp_pos = build_inp_pos();
- auto inp_attn = build_attn_inp_kv();
- ggml_tensor * inp_out_ids = build_inp_out_ids();
-
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // cur is reassigned by build_norm at the top of the attention block, before this value is read
- cur = inpL;
-
- // self_attention
- {
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
-
- // project Q, K and V (no biases are added in this builder)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- // Q/K norms are applied to the full projections, before they are reshaped into heads
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
-
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // RoPE uses n_rot dimensions, which per the note above may be smaller than the head size
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- // no output bias is passed; attention scale is 1/sqrt(head size)
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- // last layer only: keep just the rows listed in inp_out_ids, for both the attention output and the residual
- if (il == n_layer - 1 && inp_out_ids) {
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // first residual connection: attention output + layer input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // LLM_FFN_SILU experts; ffn_exp_probs_b is passed along and the gating function comes from hparams.expert_gating_func
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- (llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
- cb(cur, "ffn_moe_out", il);
- // second residual connection: MoE output + MoE input
- cur = ggml_add(ctx0, cur, ffn_inp);
-
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
- }
-
- cur = inpL;
- // final norm; its output is stored as the embeddings result (res->t_embd)
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
-
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- // lm_head: multiply by model.output and store the result as the logits (res->t_logits)
- cur = build_lora_mm(model.output, cur);
-
- cb(cur, "result_output", -1);
- res->t_logits = cur;
-
- ggml_build_forward_expand(gf, cur);
- }
-};
-
-struct llm_build_cogvlm : public llm_graph_context {
- llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- float kq_scale = 1.0f / sqrtf(float(n_embd_head));
-
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
-
- ggml_tensor * inpL, * cur;
- inpL = build_inp_embd(model.tok_embd);
-
- ggml_tensor * inp_pos = build_inp_pos();
-
- auto * inp_attn = build_attn_inp_kv();
-
- // check ubatch to see if we have input tokens (text)
- // or an input embedding vector (image)
- bool is_text;
- if (ubatch.token) {
- is_text = true;
- } else {
- is_text = false;
- }
-
- for (int il = 0; il < n_layer; ++il) {
- // get either the text or image weight tensors
- ggml_tensor * wqkv, * wo;
- ggml_tensor * ffn_gate, * ffn_down, * ffn_up;
-
- if (is_text) {
- wqkv = model.layers[il].wqkv;
- wo = model.layers[il].wo;
- ffn_gate = model.layers[il].ffn_gate;
- ffn_down = model.layers[il].ffn_down;
- ffn_up = model.layers[il].ffn_up;
- } else {
- wqkv = model.layers[il].visexp_attn_wqkv;
- wo = model.layers[il].visexp_attn_wo;
- ffn_gate = model.layers[il].visexp_ffn_gate;
- ffn_down = model.layers[il].visexp_ffn_down;
- ffn_up = model.layers[il].visexp_ffn_up;
- }
-
- ggml_tensor * inpSA = inpL;
- cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-
- // build self attention
- {
- ggml_tensor * qkv = build_lora_mm(wqkv, cur);
-
- // split qkv into Q, K, V along the first dimension
- ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
- qkv->nb[1], 0);
- ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
- qkv->nb[1], n_embd * ggml_element_size(qkv));
- ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
- qkv->nb[1], 2 * n_embd * ggml_element_size(qkv));
-
- Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type);
- Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
-
- cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
-
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
-
- cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_ffn(cur,
- ffn_up, NULL, NULL,
- ffn_gate, NULL, NULL,
- ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
-
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
-
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
-
- }
-};
-
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
llama_memory_i * res;
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for Apertus: per layer RMS-norm -> attention (Q/K RMS-normed, then RoPE) -> residual -> xIELU FFN -> residual; ends with the output norm and lm_head.
+llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot); // this builder requires n_rot to equal the head size
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // 0 selects the default 1/sqrt(head size)
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); // handed to both rope calls below
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // norm is applied after the per-head reshape
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // norm is applied after the per-head reshape
+ cb(Kcur, "Kcur_normed", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+ cb(Vcur, "Vcur_pos", il); // note: V was only reshaped above, no RoPE is applied to it
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network with xIELU activation
+ {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // Up projection
+ ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ float alpha_n_val = hparams.xielu_alpha_n[il]; // xIELU parameters are read per layer (indexed by il)
+ float alpha_p_val = hparams.xielu_alpha_p[il];
+ float beta_val = hparams.xielu_beta[il];
+ float eps_val = hparams.xielu_eps[il];
+
+ // Apply xIELU activation
+ ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+ cb(activated, "ffn_xielu", il);
+
+ // Down projection
+ cur = build_lora_mm(model.layers[il].ffn_down, activated);
+ cb(cur, "ffn_down", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for Arcee: per layer RMS-norm -> attention (optional Q/K/V biases, RoPE) -> residual -> non-gated ReLU^2 FFN -> residual; ends with the output norm and lm_head.
+llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot); // this builder requires n_rot to equal the head size
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // 0 selects the default 1/sqrt(head size)
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) { // Q bias is added only when the tensor is present
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) { // K bias is added only when the tensor is present
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) { // V bias is added only when the tensor is present
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ // ARCEE uses relu^2 instead of silu
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL, // no gate projection is passed
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
+ cb(cur, "ffn_out", il); // "ffn_out" is reported a second time, now on the tensor that includes the residual
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for Arctic: per layer attention with RoPE, then a dense SiLU FFN (with residual) plus a MoE branch fed from the layer input; both results are summed.
+llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot); // this builder requires n_rot to equal the head size
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input; reused for the attention residual and as the MoE branch input
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); // dense FFN output plus its residual
+ cb(ffn_out, "ffn_out", il);
+
+ // MoE
+ cur = build_norm(inpSA, // the MoE branch normalizes the layer input (inpSA), not the dense FFN output
+ model.layers[il].ffn_norm_exps, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm_exps", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out); // MoE output + (dense FFN output + residual)
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for ARWKV7: derives from llm_build_rwkv7_base; per layer RMS-norm -> build_rwkv7_time_mix with token-shift state -> residual -> gated SiLU FFN -> residual.
+llm_build_arwkv7::llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * v_first = nullptr; // starts null and is handed to build_rwkv7_time_mix for every layer
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * rs_inp = build_rs_inp();
+
+ const auto n_embd = hparams.n_embd; // NOTE(review): local declared after the assert above already used the name n_embd; presumably shadows an inherited member - confirm
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); // view the activations as (n_embd, n_seq_tokens, n_seqs)
+
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+ cb(att_norm, "attn_norm", il);
+
+ ggml_tensor * x_prev = ggml_concat( // loaded token-shift state followed by the first n_seq_tokens - 1 normed tokens, joined along dim 1
+ ctx0,
+ token_shift,
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+ 1
+ );
+
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
+
+ token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); // view of the last normed token of each sequence
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); // the store op is added to the graph explicitly
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // residual around the time-mix block
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ }
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for Baichuan: RoPE is applied to Q/K only for LLM_TYPE_7B; LLM_TYPE_13B skips it and any other model type aborts. FFN is gated SiLU.
+llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot); // this builder requires n_rot to equal the head size
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; // only the 7B case below consumes positions
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ switch (model.type) {
+ case LLM_TYPE_7B: // 7B: rotate Q and K
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ break;
+ case LLM_TYPE_13B: // 13B: Q and K are left unrotated here
+ break;
+ default: // any other model type is rejected
+ GGML_ABORT("fatal error");
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for BailingMoE: per layer attention (optional Q/K/V biases, RoPE, n_rot used as head size), then routed MoE experts plus a shared-expert FFN, summed and added to the residual.
+llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) { // Q bias is added only when the tensor is present
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) { // K bias is added only when the tensor is present
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) { // V bias is added only when the tensor is present
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens); // n_rot serves as the per-head size in these reshapes
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); // attention scale is 1/sqrt(n_rot), matching the reshape above
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual around attention
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = // routed experts; cur is left untouched so the shared expert below reads the same normed input
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ false, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp); // routed + shared expert outputs
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN/MoE block
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for BailingMoE2: fused QKV projection with Q/K RMS-norm and RoPE; layers below n_layer_dense_lead use a dense FFN, the rest use MoE plus a shared expert.
+llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; // the trailing nextn_predict_layers layers are not built by this loop
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 0 * sizeof(float) * (n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); // byte offsets: Q at 0, K after n_embd floats, V after n_embd + n_embd_gqa floats
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_transformer_layers - 1 && inp_out_ids) { // last built layer: keep only the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA); // residual around attention
+ cb(sa_out, "sa_out", il);
+
+ // MoE branch
+ cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) { // leading layers use the dense gated-SiLU FFN
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ ggml_tensor * moe_out = build_moe_ffn(cur, // routed experts; cur is left untouched so the shared expert below reads the same normed input
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp); // routed + shared expert outputs
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out); // residual around the FFN/MoE block
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // the output-normed tensor is stored as t_embd
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // the lm_head result is stored as t_logits
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inp_pos = nullptr;
+
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+ inp_pos = build_inp_pos();
+ }
+
+ // construct input embeddings (token, type, position)
+ inpL = build_inp_embd(model.tok_embd);
+
+ // token types are hardcoded to zero ("Sentence A")
+ if (model.type_embd) {
+ ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+ inpL = ggml_add(ctx0, inpL, type_row0);
+ }
+ if (model.arch == LLM_ARCH_BERT) {
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ }
+ cb(inpL, "inp_embd", -1);
+
+ // embed layer norm
+ inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+ cb(inpL, "inp_norm", -1);
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * cur = inpL;
+
+ {
+ ggml_tensor * Qcur;
+ ggml_tensor * Kcur;
+ ggml_tensor * Vcur;
+
+ // self-attention
+ if (model.layers[il].wqkv) {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
+ 0 * sizeof(float) * (n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+ } else {
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ }
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ }
+
+ if (model.layers[il].attn_k_norm) {
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head * n_head_kv, n_tokens);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ }
+
+ // RoPE
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
+ model.arch == LLM_ARCH_JINA_BERT_V3) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // re-add the layer input
+ cur = ggml_add(ctx0, cur, inpL);
+
+ // attention layer norm
+ cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
+
+ if (model.layers[il].attn_norm_2 != nullptr) {
+ cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+ cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
+ }
+
+ ggml_tensor * ffn_inp = cur;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+ // MoE branch
+ cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
+ model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
+ LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cb(cur, "ffn_moe_out", il);
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
+ model.arch == LLM_ARCH_JINA_BERT_V3) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ // attentions bypass the intermediate layer
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // output layer norm
+ cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cb(cur, "result_embd", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for BitNet.
+//
+// Per layer: RMS norm -> Q/K/V projections (each followed by an optional
+// per-tensor scale multiply and an optional bias add) -> RoPE on Q and K ->
+// attention -> attn_sub_norm -> output projection -> residual add -> RMS norm ->
+// gated SiLU FFN (up/gate only) -> ffn_sub_norm -> down projection -> residual add.
+// After the last layer the hidden state is RMS-normalized (stored in res->t_embd)
+// and multiplied with model.tok_embd to obtain the logits (res->t_logits).
+llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ // the code below uses one head size for Q, K and V
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // layer input, kept for the residual connection around attention
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ // Q projection: matmul, then optional scale, then optional bias
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].wq_scale) {
+ Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ }
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ // K projection: matmul, then optional scale, then optional bias
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].wk_scale) {
+ Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ }
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ // V projection: matmul, then optional scale, then optional bias
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].wv_scale) {
+ Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ }
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ // split the projections into heads: [n_embd_head, n_head(_kv), n_tokens]
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // no projection weights are handed to build_attn here (NULL, NULL);
+ // wo is applied further down, after attn_sub_norm
+ cur = build_attn(inp_attn,
+ NULL, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ cur = build_norm(cur,
+ model.layers[il].attn_sub_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_sub_norm", il);
+
+ // output projection: matmul, then optional scale, then optional bias
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ if (model.layers[il].wo_scale) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ }
+ if (model.layers[il].bo) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
+ }
+ cb(cur, "attn_out", il);
+ }
+
+ // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // residual connection around attention
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // only the up and gate branches (with their scales) go through build_ffn;
+ // the down-projection slots are NULL because ffn_down is applied below,
+ // after ffn_sub_norm
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+ NULL, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_sub_out", il);
+
+ cur = build_norm(cur,
+ model.layers[il].ffn_sub_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_sub_norm", il);
+
+ // down projection: matmul, then optional scale
+ cur = build_lora_mm(model.layers[il].ffn_down, cur);
+ if (model.layers[il].ffn_down_scale) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ }
+ cb(cur, "ffn_down", il);
+
+ // residual connection around the FFN
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // final RMS norm; the normalized hidden state is exposed as the embedding output
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ // FIXME: do not use model.tok_embd directly, duplicate as model.output
+ cur = build_lora_mm(model.tok_embd, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for BLOOM.
+//
+// Flow: token embeddings -> embedding LayerNorm -> n_layer blocks -> output
+// LayerNorm -> output projection. Each block is: LayerNorm, attention on a fused
+// QKV projection (with bias), residual add, LayerNorm, sequential GELU FFN,
+// residual add. No position input or RoPE is built in this constructor.
+// The normalized hidden state is stored in res->t_embd, the logits in res->t_logits.
+llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // byte stride of one head, and byte offsets of the K / V slices in a fused QKV row
+    const size_t head_bytes = n_embd_head*sizeof(float);
+    const size_t k_offset   = sizeof(float)*(n_embd);
+    const size_t v_offset   = sizeof(float)*(n_embd + n_embd_gqa);
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    // LayerNorm applied directly to the token embeddings
+    hidden = build_norm(hidden, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+    cb(hidden, "inp_norm", -1);
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            cur = build_lora_mm(layer.wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            cur = ggml_add(ctx0, cur, layer.bqkv);
+            cb(cur, "bqkv", il);
+
+            // Q, K and V are views into the fused projection
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, head_bytes, cur->nb[1], 0);
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_bytes, cur->nb[1], k_offset);
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_bytes, cur->nb[1], v_offset);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur    = ggml_get_rows(ctx0, cur,    inp_out_ids);
+            hidden = ggml_get_rows(ctx0, hidden, inp_out_ids);
+        }
+
+        // residual connection around attention
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, hidden);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward: LayerNorm, then up -> GELU -> down (with biases)
+        {
+            cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   layer.ffn_up_b,   NULL,
+                    NULL,           NULL,             NULL,
+                    layer.ffn_down, layer.ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        // residual connection around the FFN
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+#include <float.h>
+
+// Graph builder for Chameleon.
+//
+// Two layer layouts are supported, selected by hparams.swin_norm:
+//   - swin_norm == false: RMS norm is applied to the input of attention and to
+//                         the input of the FFN
+//   - swin_norm == true : RMS norm is applied to the output of attention and to
+//                         the output of the FFN, before each residual add
+// In both layouts the FFN reads the residual stream (attention result + layer input).
+// Q and K are optionally normalized per head (attn_q_norm / attn_k_norm) before RoPE.
+// After the output projection, the logits of a fixed range of image-token ids are
+// overwritten with -FLT_MAX (see the TODO near the end).
+llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // layer input, kept for the residual connection around attention
+        ggml_tensor * inpSA = inpL;
+
+        // norm (skipped here for swin_norm, which normalizes after attention instead)
+        if (hparams.swin_norm) {
+            cur = inpL;
+        } else {
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            // optional Q norm, applied on a per-head 3D view of the projection
+            if (model.layers[il].attn_q_norm) {
+                Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                        ggml_element_size(Qcur) * n_embd_head,
+                        ggml_element_size(Qcur) * n_embd_head * n_head,
+                        0);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm,
+                        model.layers[il].attn_q_norm_b,
+                        LLM_NORM, il);
+                cb(Qcur, "Qcur", il);
+            }
+
+            // optional K norm, applied on a per-head 3D view of the projection
+            if (model.layers[il].attn_k_norm) {
+                Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                        ggml_element_size(Kcur) * n_embd_head,
+                        ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                        0);
+                cb(Kcur, "Kcur", il);
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm,
+                        model.layers[il].attn_k_norm_b,
+                        LLM_NORM, il);
+                cb(Kcur, "Kcur", il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, nullptr,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        // swin_norm: normalize the attention output before the residual add
+        if (hparams.swin_norm) {
+            cur = build_norm(cur,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        // The FFN must consume the residual stream (ffn_inp) in both layouts.
+        // Previously the swin_norm path left `cur` pointing at the normalized
+        // attention output, so the residual was missing from the FFN input.
+        if (hparams.swin_norm) {
+            cur = ffn_inp;
+        } else {
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+        }
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // swin_norm: normalize the FFN output before the residual add
+        if (hparams.swin_norm) {
+            cur = build_norm(cur,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+        }
+
+        // residual connection around the FFN
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output_with_img_logits", -1);
+
+    // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+    // Needs to be removed once image outputs are supported.
+    // Hard-coded id range [img_token_start_idx, img_token_end_idx) that gets masked.
+    int img_token_end_idx = 8196;
+    int img_token_start_idx = 4;
+    int num_img_tokens = img_token_end_idx - img_token_start_idx;
+    // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
+    // which ensures that text token values are always at least larger than image token values
+    ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+    img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+    cb(img_logits, "img_logits", -1);
+
+    // NOTE(review): img_logits is 1-D, so this looks like it overwrites only one
+    // row of `cur` starting at img_token_start_idx - confirm the behavior when
+    // `cur` holds logits for several output rows.
+    cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for ChatGLM.
+//
+// Per layer: RMS norm -> Q/K/V (either one fused wqkv projection or separate
+// wq/wk/wv projections, each with optional bias) -> RoPE on Q and K -> attention
+// -> residual add -> RMS norm -> sequential SwiGLU FFN -> residual add.
+// The final RMS-normalized hidden state is stored in res->t_embd and its
+// projection through model.output in res->t_logits.
+llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // byte stride of one head, and byte offsets of the K / V slices in a fused QKV row
+    const size_t head_bytes = n_embd_head*sizeof(float);
+    const size_t k_offset   = sizeof(float)*(n_embd);
+    const size_t v_offset   = sizeof(float)*(n_embd + n_embd_gqa);
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens, consumed by RoPE
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // layer input, kept for the residual connection around attention
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
+
+            if (layer.wqkv != nullptr) {
+                // fused projection: Q, K and V are views into one activation
+                cur = build_lora_mm(layer.wqkv, cur);
+                cb(cur, "wqkv", il);
+                if (layer.bqkv) {
+                    cur = ggml_add(ctx0, cur, layer.bqkv);
+                    cb(cur, "bqkv", il);
+                }
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, head_bytes, cur->nb[1], 0);
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_bytes, cur->nb[1], k_offset);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_bytes, cur->nb[1], v_offset);
+            } else {
+                // separate projections, each with an optional bias
+                Qcur = build_lora_mm(layer.wq, cur);
+                if (layer.bq) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.bq);
+                }
+                Kcur = build_lora_mm(layer.wk, cur);
+                if (layer.bk) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.bk);
+                }
+                Vcur = build_lora_mm(layer.wv, cur);
+                if (layer.bv) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.bv);
+                }
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            }
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
+            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+        }
+
+        // residual connection around attention
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward: RMS norm, then up -> SwiGLU -> down (no separate gate tensor)
+        {
+            cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   NULL, NULL,
+                    NULL,           NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        // residual connection around the FFN; result feeds the next layer
+        hidden = ggml_add(ctx0, cur, ffn_inp);
+        cb(hidden, "l_out", il);
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for CodeShell.
+//
+// Per layer: LayerNorm -> fused QKV projection with bias -> RoPE on Q and K ->
+// attention -> residual add -> LayerNorm -> sequential GELU FFN (with biases) ->
+// residual add. The final LayerNorm output is stored in res->t_embd and its
+// projection through model.output in res->t_logits.
+llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    // byte stride of one head, and byte offsets of the K / V slices in a fused QKV row
+    const size_t head_bytes = n_embd_head*sizeof(float);
+    const size_t k_offset   = sizeof(float)*(n_embd);
+    const size_t v_offset   = sizeof(float)*(n_embd + n_embd_gqa);
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens, consumed by RoPE
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            cur = build_lora_mm(layer.wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            cur = ggml_add(ctx0, cur, layer.bqkv);
+            cb(cur, "bqkv", il);
+
+            // Q, K and V are views into the fused projection
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, head_bytes, cur->nb[1], 0);
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_bytes, cur->nb[1], k_offset);
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_bytes, cur->nb[1], v_offset);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur    = ggml_get_rows(ctx0, cur,    inp_out_ids);
+            hidden = ggml_get_rows(ctx0, hidden, inp_out_ids);
+        }
+
+        // residual connection around attention
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, hidden);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward: LayerNorm, then up -> GELU -> down (with biases)
+        {
+            cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   layer.ffn_up_b,   NULL,
+                    NULL,           NULL,             NULL,
+                    layer.ffn_down, layer.ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        // residual connection around the FFN
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for CogVLM.
+//
+// Every layer owns two sets of attention/FFN weights. Which set is used is
+// decided once per micro-batch: when ubatch.token is set the regular weights are
+// used, otherwise (embedding input) the visexp_* weights are used. The norm
+// weights are shared by both cases.
+// Per layer: RMS norm -> fused QKV projection -> RoPE on Q and K -> attention ->
+// residual add -> RMS norm -> gated SiLU FFN -> residual add.
+llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    // token ids present -> text input; otherwise the batch carries an
+    // input embedding vector (image)
+    const bool is_text = ubatch.token != nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // pick either the text or the image weight tensors
+        ggml_tensor * wqkv     = is_text ? layer.wqkv     : layer.visexp_attn_wqkv;
+        ggml_tensor * wo       = is_text ? layer.wo       : layer.visexp_attn_wo;
+        ggml_tensor * ffn_gate = is_text ? layer.ffn_gate : layer.visexp_ffn_gate;
+        ggml_tensor * ffn_down = is_text ? layer.ffn_down : layer.visexp_ffn_down;
+        ggml_tensor * ffn_up   = is_text ? layer.ffn_up   : layer.visexp_ffn_up;
+
+        // layer input, kept for the residual connection around attention
+        ggml_tensor * residual = hidden;
+        ggml_tensor * cur      = build_norm(residual, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+
+        // self-attention
+        {
+            ggml_tensor * qkv = build_lora_mm(wqkv, cur);
+
+            const size_t head_bytes = n_embd_head * sizeof(float);
+            const size_t elt_bytes  = ggml_element_size(qkv);
+
+            // split qkv into Q, K, V along the first dimension
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head,    n_tokens, head_bytes,
+                                              qkv->nb[1], 0);
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, head_bytes,
+                                              qkv->nb[1], n_embd * elt_bytes);
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, head_bytes,
+                                              qkv->nb[1], 2 * n_embd * elt_bytes);
+
+            Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type);
+            Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
+
+            cur = build_attn(inp_attn,
+                    wo, nullptr,
+                    Qcur, Kcur, Vcur,
+                    nullptr, nullptr, nullptr,
+                    kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // residual connection around attention
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                ffn_up,   NULL, NULL,
+                ffn_gate, NULL, NULL,
+                ffn_down, NULL, NULL,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+
+        // residual connection around the FFN
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for Cohere2 with interleaved sliding-window attention.
+//
+// Each layer normalizes its input once and feeds that same normalized tensor to
+// both the attention branch and the gated SiLU FFN branch; the layer output is
+// layer input + FFN output + attention output. RoPE is applied to Q and K only
+// on layers for which hparams.is_swa(il) is true. The logits are multiplied by
+// hparams.f_logit_scale when it is non-zero.
+llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    const float f_logit_scale = hparams.f_logit_scale;
+    const float kq_scale      = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens, consumed by RoPE
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv_iswa();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer  = model.layers[il];
+        const bool   is_swa = hparams.is_swa(il);
+
+        // shared pre-norm for both branches
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+        ggml_tensor * ffn_inp = cur;
+
+        // self-attention
+        {
+            // per-layer RoPE frequency factors supplied by the model
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // Q/K/V projections, each with an optional bias
+            ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (layer.bq) {
+                Qcur = ggml_add(ctx0, Qcur, layer.bq);
+                cb(Qcur, "Qcur", il);
+            }
+
+            ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (layer.bk) {
+                Kcur = ggml_add(ctx0, Kcur, layer.bk);
+                cb(Kcur, "Kcur", il);
+            }
+
+            ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (layer.bv) {
+                Vcur = ggml_add(ctx0, Vcur, layer.bv);
+                cb(Vcur, "Vcur", il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // positional rotation only on sliding-window layers
+            if (is_swa) {
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur     = ggml_get_rows(ctx0, cur,     inp_out_ids);
+            hidden  = ggml_get_rows(ctx0, hidden,  inp_out_ids);
+            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+        }
+
+        ggml_tensor * attn_out = cur;
+
+        // feed-forward network, reading the same normalized input as attention
+        {
+            cur = build_ffn(ffn_inp,
+                    layer.ffn_up,   NULL, NULL,
+                    layer.ffn_gate, NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        // add together residual + FFN + self-attention
+        cur = ggml_add(ctx0, cur, hidden);
+        cur = ggml_add(ctx0, cur, attn_out);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+
+    if (f_logit_scale) {
+        out = ggml_scale(ctx0, out, f_logit_scale);
+    }
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+
+// Graph builder for Command-R.
+//
+// Each layer normalizes its input once and feeds that same normalized tensor to
+// both the attention branch and the gated SiLU FFN branch; the layer output is
+// layer input + FFN output + attention output. Q and K are optionally
+// normalized (attn_q_norm / attn_k_norm) after being split into heads and before
+// RoPE. The logits are multiplied by hparams.f_logit_scale when it is non-zero.
+llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    const float f_logit_scale = hparams.f_logit_scale;
+    const float kq_scale      = 1.0f / sqrtf(float(n_embd_head));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens, consumed by RoPE
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // shared pre-norm for both branches
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        ggml_tensor * ffn_inp = cur;
+
+        // self-attention
+        {
+            // Q/K/V projections, each with an optional bias
+            ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (layer.bq) {
+                Qcur = ggml_add(ctx0, Qcur, layer.bq);
+                cb(Qcur, "Qcur", il);
+            }
+
+            ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (layer.bk) {
+                Kcur = ggml_add(ctx0, Kcur, layer.bk);
+                cb(Kcur, "Kcur", il);
+            }
+
+            ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (layer.bv) {
+                Vcur = ggml_add(ctx0, Vcur, layer.bv);
+                cb(Vcur, "Vcur", il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // optional Q norm, then RoPE on Q
+            if (layer.attn_q_norm) {
+                Qcur = build_norm(Qcur, layer.attn_q_norm, NULL, LLM_NORM, il);
+                cb(Qcur, "Qcur", il);
+            }
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // optional K norm, then RoPE on K
+            if (layer.attn_k_norm) {
+                Kcur = build_norm(Kcur, layer.attn_k_norm, NULL, LLM_NORM, il);
+                cb(Kcur, "Kcur", il);
+            }
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when provided)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur     = ggml_get_rows(ctx0, cur,     inp_out_ids);
+            hidden  = ggml_get_rows(ctx0, hidden,  inp_out_ids);
+            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+        }
+
+        ggml_tensor * attn_out = cur;
+
+        // feed-forward network, reading the same normalized input as attention
+        {
+            cur = build_ffn(ffn_inp,
+                    layer.ffn_up,   NULL, NULL,
+                    layer.ffn_gate, NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        // add together residual + FFN + self-attention
+        cur = ggml_add(ctx0, cur, hidden);
+        cur = ggml_add(ctx0, cur, attn_out);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+
+    if (f_logit_scale) {
+        out = ggml_scale(ctx0, out, f_logit_scale);
+    }
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds the DBRX graph: per layer, fused-QKV RoPE attention followed by a MoE FFN; results are stored in res->t_embd and res->t_logits
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ // the code below uses a single head size for Q, K and V, and the RoPE width (n_rot) must equal it
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ // input to the first layer, built from model.tok_embd
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+ // row indices used on the last layer to keep only selected rows (may be null, see the check below)
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // pre-attention norm (LLM_NORM with the attn_norm weights)
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+ // a single fused projection; Q, K and V are carved out of its result as views further down
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ // clamp the fused QKV activations to [-f_clamp_kqv, +f_clamp_kqv]
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(cur, "wqkv_clamped", il);
+ // views into the fused row: Q at offset 0, K after n_embd floats, V after n_embd + n_embd_gqa floats (byte offsets use sizeof(float), i.e. this assumes cur is F32 - TODO confirm)
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ // RoPE is applied to Q and K only; V is used as-is
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+ // attention with output projection wo (NULL is passed where other builders pass a bo bias) and a 1/sqrt(head size) scale
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ // on the last layer, keep only the rows listed in inp_out_ids (when it is non-null)
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // first residual: attention output + layer input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ // MoE branch; its input is normalized with the attn_out_norm weights
+ cur = build_norm(ffn_inp,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_out_norm", il);
+ // experts with SiLU and softmax gating; the positional arguments (true, false, 0.0) are interpreted by build_moe_ffn, which is not visible here
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ // second residual: MoE output + ffn_inp
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ // final norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head: multiply by model.output; the result is stored in res->t_logits
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds the Deci graph; head counts and FFN width are read per layer, and a layer may skip attention and/or the FFN
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ // the code below uses a single head size for Q, K and V, and the RoPE width (n_rot) must equal it
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+ // attention scale: hparams.f_attention_scale when it is non-zero, otherwise 1/sqrt(head size)
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+ const int64_t n_head_kv = hparams.n_head_kv(il); // these three values are looked up per layer and drive the branches below
+ const int64_t n_head = hparams.n_head(il);
+ const int64_t n_ff = hparams.n_ff(il);
+ // n_head == 0 marks a layer without attention: no norm is applied and the input is passed through
+ if (n_head == 0) {
+ // attention-free layer of Llama-3_1-Nemotron-51B
+ cur = inpL;
+ } else {
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ if (n_head > 0 && n_head_kv == 0) {
+ // "linear attention" of Llama-3_1-Nemotron-51B
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ cb(cur, "wo", il);
+ } else if (n_head > 0) {
+ // self-attention
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q, K and V (each bias is added only when the tensor is present), then RoPE Q and K
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows listed in inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+ if (n_ff == 0) {
+ continue; // NOTE(review): inpL is left untouched here, so anything computed above for this layer is dropped - confirm that FFN-free layers are always attention-free too
+ }
+ // modified to support attention-free layer of Llama-3_1-Nemotron-51B
+ ggml_tensor * ffn_inp = cur;
+ if (n_head > 0) {
+ ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+ }
+ // feed-forward network: the dense FFN is built only when the layer has no ffn_gate_inp tensor
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual; if the branch above was not taken, cur still holds the pre-FFN value
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ // final RMS norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head: multiply by model.output; the result is stored in res->t_logits
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) { // builds the DeepSeek graph: per layer, RoPE attention, then a dense FFN (first n_layer_dense_lead layers) or MoE plus a shared expert
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ // the code below uses a single head size for Q, K and V, and the RoPE width (n_rot) must equal it
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+ // attention scale: hparams.f_attention_scale when it is non-zero, otherwise 1/sqrt(head size)
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // pre-attention RMS norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q, K and V (each bias is added only when the tensor is present), then RoPE Q and K
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows listed in inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+ // this RMS norm feeds both the dense and the MoE path below
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // layers with index below n_layer_dense_lead use the dense FFN; all later layers use MoE
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch (softmax gating, scaled by hparams.expert_weights_scale)
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert: runs on the same normalized input and is summed with the MoE output
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN / MoE block
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ // final RMS norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head: multiply by model.output; the result is stored in res->t_logits
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) { // builds the DeepSeek2 graph: attention over a compressed KV latent (with or without MLA absorption), then a dense FFN or MoE plus a shared expert
+ bool is_lite = (hparams.n_layer == 27); // NOTE(review): the lite variant is detected purely from the layer count; it selects the single wq projection below
+
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+ // each K/Q head is split into a RoPE part (n_rot wide) and a remaining non-RoPE ("nope") part
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+ // declaration order matters: mscale reads the attn_factor that is already in scope (not declared in this function); the local attn_factor declared afterwards shadows it for the rope calls below
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+ const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+ const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // pre-attention RMS norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ ggml_tensor * q = NULL;
+ if (!is_lite) { // non-lite: two-stage query projection wq_a -> RMS norm -> wq_b
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+ cb(q, "q", il);
+
+ q = build_norm(q, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+ cb(q, "q", il);
+
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+ cb(q, "q", il);
+ } else { // lite: a single wq projection
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(q, "q", il);
+ }
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * q_nope =
+ ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+ ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(
+ ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+ ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+ // one projection yields both the compressed KV latent (kv_lora_rank wide) and the single-head RoPE key part
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ ggml_tensor * kv_cmpr =
+ ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+ cb(kv_cmpr, "kv_cmpr", il);
+
+ // and {n_embd_head_qk_rope, 1, n_tokens}
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+ // RoPE is applied only to the *_pe parts of Q and K
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(q_pe, "q_pe", il);
+
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(k_pe, "k_pe", il);
+ // RMS-normalize the compressed KV latent
+ kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+ cb(kv_cmpr, "kv_cmpr", il);
+
+ if (is_mla) {
+ // {n_embd_head_qk_nope, n_tokens, n_head}
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+ cb(q_nope, "q_nope_perm", il);
+
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+ // {kv_lora_rank, n_head, n_tokens}
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+ cb(Qcur, "Qcur", il);
+
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+ cb(Kcur, "Kcur", il);
+
+ // {kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Vcur = kv_cmpr;
+ cb(Vcur, "Vcur", il);
+ // V is the compressed latent itself; wv_b is handed to build_attn here (the non-MLA branch passes nullptr in that position)
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
+ } else {
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); // expand the latent with wkv_b; k_nope and V are views into the result
+ cb(kv, "kv", il);
+
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * k_nope =
+ ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, 0);
+ cb(k_nope, "k_nope_view", il);
+
+ // and {n_embd_head_v, n_head, n_tokens}
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
+ cb(Vcur, "Vcur_view", il);
+
+ Vcur = ggml_cont(ctx0, Vcur);
+ cb(Vcur, "Vcur_cont", il);
+
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
+ cb(Qcur, "Qcur", il);
+ // k_pe has a single head; it is repeated to q_pe's shape so it can be concatenated with the per-head k_nope
+ ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+ cb(Kcur, "Kcur", il);
+
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows listed in inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+ // this RMS norm feeds both the dense and the MoE path below
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // layers with index below n_layer_dense_lead use the dense FFN; all later layers use MoE
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch; the gating function, weight normalization and weight scale all come from hparams
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert: runs on the same normalized input and is summed with the MoE output
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN / MoE block
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ // final RMS norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head (a plain ggml_mul_mat here, not build_lora_mm); the result is stored in res->t_logits
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) { // builds the dots1 graph: per layer, attention with RMS-normed Q/K and RoPE, then a dense FFN (first n_layer_dense_lead layers) or MoE plus a shared expert
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ // the code below uses a single head size for Q, K and V, and the RoPE width (n_rot) must equal it
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // pre-attention RMS norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q, K and V (no bias is added), then norm and RoPE Q and K
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ // RMS norm (attn_q_norm / attn_k_norm) is applied to the head-reshaped Q and K before RoPE
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows listed in inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FFN input norm; it feeds both the dense and the MoE path below
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // layers with index below n_layer_dense_lead use the dense FFN; all later layers use MoE
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else { // MoE branch; the gating function, weight normalization and weight scale all come from hparams
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+ // shared expert: runs on the same normalized input and is summed with the MoE output
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN / MoE block
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ // final RMS norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head: multiply by model.output; the result is stored in res->t_logits
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) { // builds the Dream graph: per layer, biased-QKV RoPE attention on a no-cache attention input, followed by a dense SiLU FFN
+ // copied from qwen2
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ // the code below uses a single head size for Q, K and V, and the RoPE width (n_rot) must equal it
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache(); // note: the no-cache attention input helper, where the neighbouring builders use build_attn_inp_kv()
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // pre-attention RMS norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q, K and V, then RoPE Q and K; bq/bk/bv are added without a null check, so they are assumed to be present
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows listed in inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (RMS norm, then up/gate/down with SiLU)
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ // residual around the FFN
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ // final RMS norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head: multiply by model.output; the result is stored in res->t_logits
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) { // builds the Ernie 4.5 MoE graph: per layer, RoPE attention, then either a dense FFN or MoE (plus an optional shared expert) depending on the layer index
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ // the code below uses a single head size for Q, K and V, and the RoPE width (n_rot) must equal it
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ // n_moe_layer_step is used as a modulus when classifying layers below, so it must be non-zero
+ GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+ // pre-attention RMS norm
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ // self-attention
+ {
+ // compute Q, K and V (each bias is added only when the tensor is present), then RoPE Q and K
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer: keep only the rows listed in inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network: a layer is MoE when il >= n_layer_dense_lead and (il + 1) is a multiple of n_moe_layer_step
+ bool is_moe_layer =
+ static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
+
+ if (!is_moe_layer) {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch (same ffn_norm as the dense path, softmax gating)
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // Shared expert (built only when hparams.n_ff_shexp > 0); its output is summed with the MoE output
+ if (hparams.n_ff_shexp > 0) {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ } else {
+ cur = moe_out;
+ }
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp); // residual around the FFN / MoE block
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ // final RMS norm; its output is what gets stored in res->t_embd
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head: multiply by model.output; the result is stored in res->t_logits
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Builds the forward graph for ernie4_5: per layer RMS norm -> self-attention with RoPE -> residual -> RMS norm -> FFN -> residual; then output norm and lm_head.
+llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v; // head size from the V heads; asserted below to equal the K head size and n_rot
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd); // input to the first layer
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the attention residual
+
+ // norm
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) { // Q/K/V biases are added only when the bias tensor is present
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: [n_embd_head, n_head, n_tokens]
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // K and V use n_head_kv heads
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow); // RoPE on Q; no rope-factors tensor is passed (nullptr)
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow); // same RoPE parameters on K; V is not rotated
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); // wo passed with NULL in the slot after it; scale = 1/sqrt(n_embd_head)
+ }
+ if (il == n_layer - 1) { // last layer only
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // attention residual
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); // up, gate and down weights only; LLM_FFN_SILU with LLM_FFN_PAR
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp); // FFN residual
+
+ cur = build_cvec(cur, il); // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector for this layer - confirm
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL; // output of the last layer
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // normalized hidden state stored as the embedding result
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // lm_head output stored as the logits result
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Builds the forward graph for exaone: per layer RMS norm -> self-attention with RoPE (per-layer rope factors) -> residual -> RMS norm -> FFN -> residual; then output norm and lm_head.
+llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v; // head size from the V heads; asserted below to equal the K head size and n_rot
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd); // input to the first layer
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids(); // row ids applied after the last layer's attention; null-checked before use below
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // layer input, kept for the attention residual
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // per-layer rope freq factors from the model; per the original note (written for llama3/llama2) this may be nullptr - NOTE(review): confirm for exaone
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) { // Q/K/V biases are added only when the bias tensor is present
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: [n_embd_head, n_head, n_tokens]
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // K and V use n_head_kv heads
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow); // RoPE on Q using rope_factors
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow); // same RoPE parameters on K; V is not rotated
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); // wo and bo passed to build_attn; scale = 1/sqrt(n_embd_head)
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // attention residual
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); // up, gate and down weights only; LLM_FFN_SILU with LLM_FFN_PAR
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // FFN residual
+ cb(cur, "ffn_out", il); // the residual sum is registered under the same "ffn_out" name as the raw FFN output
+
+ cur = build_cvec(cur, il); // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector for this layer - confirm
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL; // output of the last layer
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // normalized hidden state stored as the embedding result
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // lm_head output stored as the logits result
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Builds the forward graph for exaone4.
+//
+// Per layer: self-attention runs directly on the layer input (there is no pre-attention norm),
+// Q and K are RMS-normed after being split into heads, and RoPE is applied only on SWA layers or
+// when the model uses no SWA at all. The attention output and the FFN output are each RMS-normed
+// (attn_post_norm / ffn_post_norm) before their residual add. The graph ends with the output norm
+// and lm_head.
+//
+// The iswa template flag selects which KV attention input is built
+// (build_attn_inp_kv_iswa when true, build_attn_inp_kv otherwise).
+template <bool iswa>
+llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ // head size from the K heads; asserted below to equal the V head size and n_rot
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+ // row ids applied after the last layer's attention; null-checked before use
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // layer input, kept for the attention residual
+ ggml_tensor * inpSA = inpL;
+
+ // use RoPE for SWA layers or non-SWA models
+ const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
+
+ // no pre-attention norm: the projections read the layer input as-is
+ cur = inpL;
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // split into heads: [n_embd_head, n_head, n_tokens]; K and V use n_head_kv heads
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // RMS norm on Q and K, applied before RoPE
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+
+ if (use_rope) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // attention scale = 1/sqrt(n_embd_head)
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "attn_out", il);
+ }
+ // last layer only: keep just the rows selected by inp_out_ids
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // attention residual
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (no pre-FFN norm; the norm is applied to the FFN output instead)
+ cur = build_ffn(ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ // tag the post-FFN norm with the layer index like every other per-layer tensor in this loop
+ // (-1 is what the final, non-layer tensors below use)
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ // FFN residual
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector - confirm
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_exaone4<false>;
+template struct llm_build_exaone4<true>;
--- /dev/null
+#include "models.h"
+
+
+
+// Builds the forward graph for falcon_h1.
+//
+// Per layer: the RMS-normed layer input feeds a RoPE self-attention branch and a Mamba2 branch;
+// both branch outputs are summed and added to the layer input. That sum then goes through
+// RMS norm -> FFN (bias tensors passed through) -> residual. The graph ends with the output norm
+// and lm_head.
+llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // Build the inputs in the recurrent & kv cache
+ auto * inp = build_inp_mem_hybrid();
+
+ // use the model's attention scale when it is set; a value of 0 falls back to 1/sqrt(n_embd_head)
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ // row ids applied on the last layer; null-checked before use
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // residual stream for this layer
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // split into heads: [n_embd_head, n_head, n_tokens]; K and V use n_head_kv heads
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur-post-rope", il);
+ cb(Kcur, "Kcur-post-rope", il);
+ cb(Vcur, "Vcur-post-rope", il);
+
+ ggml_tensor * attn_out = build_attn(inp->get_attn(),
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(attn_out, "attn_out", il);
+
+ // Mamba2 layer: the same attn_norm is applied to the layer input again to form the SSM input
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ssm_in", il);
+
+ ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+ cb(ssm_out, "ssm_out", il);
+
+ // aggregation: sum both branches, then add the result to the residual stream
+ cur = ggml_add(ctx0, attn_out, ssm_out);
+ inpSA = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "layer_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ // keep only the rows selected by inp_out_ids; cur is rebuilt from ffn_inp right below,
+ // so only the residual stream needs to be gathered here
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = inpSA;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ // FFN residual
+ cur = ggml_add(ctx0, cur, inpSA);
+
+ // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector - confirm
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the forward graph for falcon: per layer LayerNorm -> fused-QKV self-attention with RoPE, an FFN fed from the same attn_norm, and both outputs added to the layer input; then output norm and lm_head.
+llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v; // head size from the V heads; asserted below to equal the K head size and n_rot
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // used below as the K slice width when locating V in the fused QKV output
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd); // input to the first layer
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids(); // row ids applied on the last layer; null-checked before use below
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * attn_norm;
+
+ attn_norm = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il); // LLM_NORM with weight and bias; reused later as the FFN input
+ cb(attn_norm, "attn_norm", il);
+
+ // self-attention
+ {
+ if (model.layers[il].attn_norm_2) { // a second norm, when present, feeds attention instead of attn_norm
+ // Falcon-40B
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm_2,
+ model.layers[il].attn_norm_2_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm_2", il);
+ } else {
+ cur = attn_norm;
+ }
+
+ cur = build_lora_mm(model.layers[il].wqkv, cur); // single fused projection producing Q, K and V
+ cb(cur, "wqkv", il);
+ // slice the fused output into per-head views: Q at float offset 0, K at n_embd, V at n_embd + n_embd_gqa (offsets and strides use sizeof(float))
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ // the RoPE mode passed here is rope_type (original note: "using mode = 2 for neox mode") - NOTE(review): confirm rope_type resolves to neox for falcon
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scale = 1/sqrt(n_embd_head)
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) { // last layer only: gather the selected rows from every tensor still in use
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = cur; // attention output, added back after the FFN (it is not the FFN's input)
+
+ // feed forward
+ {
+ cur = build_ffn(attn_norm, // !! use the attn norm, not the result
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il); // up and down weights only (no gate); LLM_FFN_GELU with LLM_FFN_SEQ
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp); // FFN output + attention output
+ cur = ggml_add(ctx0, cur, inpL); // + layer input (residual)
+
+ cur = build_cvec(cur, il); // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector for this layer - confirm
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL; // output of the last layer
+
+ // norm
+ cur = build_norm(cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // normalized hidden state stored as the embedding result
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // output projection stored as the logits result
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+// Builds the forward graph for gemma_embedding.
+//
+// Token embeddings are scaled by sqrt(n_embd) (skipped when the batch carries raw embeddings).
+// Per layer: RMS norm -> self-attention built on the no-cache attention input, with RMS-normed
+// Q/K, per-layer RoPE frequency base/scale and Q pre-scaled by f_attention_scale ->
+// attn_post_norm -> residual -> ffn_norm -> GELU FFN -> ffn_post_norm -> residual.
+// The graph ends at the output norm: only res->t_embd is set, there is no lm_head.
+llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+ if (ubatch.token) {
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ // row ids applied on the last layer; null-checked before use
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // RoPE frequency parameters are looked up per layer
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // split into heads: [n_embd_head, n_head, n_tokens]; K and V use n_head_kv heads
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+ // Q is scaled here, so build_attn is given a scale of 1.0f
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur =
+ build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+
+ // last layer only: keep just the rows selected by inp_out_ids
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // attention residual
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ // tag the post-FFN norm with the layer index like every other per-layer tensor in this loop
+ // (-1 is what the final, non-layer tensors below use)
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ // FFN residual
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector - confirm
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the forward graph for gemma: embeddings scaled by sqrt(n_embd); per layer RMS norm -> self-attention with RoPE (Q pre-scaled) -> residual -> RMS norm -> GELU FFN -> residual; then output norm and lm_head.
+llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v; // head size from the V heads; used for the head reshape and the Q scale
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); // embeddings are always multiplied by sqrt(n_embd) here
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids(); // row ids applied on the last layer; null-checked before use below
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: [n_embd_head, n_head, n_tokens]
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // K and V use n_head_kv heads
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); // the 1/sqrt(n_embd_head) scale is applied to Q here...
+ cb(Qcur, "Qcur_scaled", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); // ...so build_attn is given a scale of 1.0f
+ }
+ if (il == n_layer - 1 && inp_out_ids) { // last layer only: keep just the rows selected by inp_out_ids
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); // attention residual
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il); // up, gate and down weights only; LLM_FFN_GELU with LLM_FFN_PAR
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, sa_out); // FFN residual
+
+ cur = build_cvec(cur, il); // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector for this layer - confirm
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL; // output of the last layer
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur; // normalized hidden state stored as the embedding result
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur; // lm_head output stored as the logits result
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the forward graph for gemma2 (iswa KV attention input).
+//
+// Token embeddings are scaled by sqrt(n_embd). Per layer: RMS norm -> self-attention with RoPE and
+// Q pre-scaled by f_attention_scale -> attn_post_norm -> residual -> ffn_norm -> GELU FFN ->
+// ffn_post_norm -> residual. After the output norm and lm_head, the logits are soft-capped as
+// cap * tanh(logits / cap) with cap = f_final_logit_softcapping.
+llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ // row ids applied on the last layer; null-checked before use
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // split into heads: [n_embd_head, n_head, n_tokens]; K and V use n_head_kv heads
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // Q is scaled here, so build_attn is given a scale of 1.0f
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ // last layer only: keep just the rows selected by inp_out_ids
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // attention residual
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ // tag the post-FFN norm with the layer index like every other per-layer tensor in this loop
+ // (-1 is what the final, non-layer tensors below use)
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ // FFN residual
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector - confirm
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ // final logit soft-capping: cap * tanh(logits / cap)
+ // NOTE(review): divides by f_final_logit_softcapping; assumes the loader guarantees it is non-zero - confirm
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the forward graph for gemma3 (iswa KV attention input).
+//
+// Token embeddings are scaled by sqrt(n_embd) (skipped when the batch carries raw embeddings).
+// Per layer: RMS norm -> self-attention with RMS-normed Q/K, per-layer RoPE frequency base/scale
+// and Q pre-scaled by f_attention_scale -> attn_post_norm -> residual -> ffn_norm -> GELU FFN ->
+// ffn_post_norm -> residual. The graph ends with the output norm and lm_head.
+llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+ if (ubatch.token) {
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+ }
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // TODO: is causal == true correct? might need some changes
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ // row ids applied on the last layer; null-checked before use
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // RoPE frequency parameters are looked up per layer
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // split into heads: [n_embd_head, n_head, n_tokens]; K and V use n_head_kv heads
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+ // Q is scaled here, so build_attn is given a scale of 1.0f
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ // last layer only: keep just the rows selected by inp_out_ids
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // attention residual
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ // tag the post-FFN norm with the layer index like every other per-layer tensor in this loop
+ // (-1 is what the final, non-layer tensors below use)
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ // FFN residual
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ // NOTE(review): build_cvec is defined elsewhere; presumably applies a control vector - confirm
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+// Builds the compute graph for Gemma 3n (interleaved sliding-window attention with AltUp and LAuReL).
+//
+// Stages, in order:
+//   1. embed the input (scaled by sqrt(n_embd) for token input) and build the per-layer inputs
+//   2. project the single embedding stream into n_altup streams stacked along dim 2
+//   3. per layer: altup_predict -> attention + laurel on the active stream -> gated FFN
+//      -> altup_correct -> inject the per-layer input into streams 1..n_altup-1
+//   4. merge the streams back into one, apply the output norm, lm_head and final logit soft-capping
+//
+// The normed hidden state is stored in res->t_embd, the logits in res->t_logits, and the graph
+// is expanded into gf.
+llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params),
+    model(model),
+    n_embd_head(model.hparams.n_embd_head_k),
+    n_embd_altup(model.hparams.n_embd_altup),
+    n_altup(model.hparams.n_altup),
+    i_altup_act(model.hparams.i_altup_act) {
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // TODO: is causal == true correct? might need some changes
+    auto * inp_attn = build_attn_inp_kv_iswa();
+
+    // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
+    ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
+
+    // inpL now has only 1 altup, project it to the rest of the altups
+    // these "added" altups will be concat to the last dim of inpL
+    {
+        ggml_tensor * target_magnitude = calc_magnitude(inpL);
+        ggml_tensor * inp_repeated     = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
+        ggml_tensor * altup_added =
+            ggml_mul_mat(ctx0, model.altup_proj, inp_repeated);  // shape: [n_embd, n_tokens, n_altup - 1]
+        ggml_tensor * new_magnitude = calc_magnitude(altup_added);
+        // rescale the projected streams so their magnitude matches the original stream
+        altup_added = ggml_div(ctx0, ggml_mul(ctx0, altup_added, target_magnitude), new_magnitude);
+        inpL        = ggml_concat(ctx0, inpL, altup_added, 2);  // shape: [n_embd, n_tokens, n_altup]
+        cb(inpL, "inp_stacked", -1);
+    }
+    // inpL now has shape:          [n_embd,       n_tokens, n_altup]
+    // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
+
+    for (int il = 0; il < n_layer; ++il) {
+        // this block is made to closely resemble Gemma3p5DecoderLayer in the python code
+        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+        // reuse the function-scope `cur` instead of declaring a second one that shadows it
+        cur = inpL;                                           // [n_embd, n_tokens, n_altup]
+        ggml_tensor * predictions = altup_predict(cur, il);   // [n_embd, n_tokens, n_altup]
+
+        // predicted value will go through self-attention and laurel
+        ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);  // [n_embd, n_tokens]
+        cur = active_prediction;
+        cb(cur, "active_prediction", il);
+
+        // norm
+        cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // laurel
+        ggml_tensor * laurel_out = laurel(cur, il);  // [n_embd, n_tokens]
+
+        // self-attention
+        if (hparams.has_kv(il)) {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // Q and K use learned norm weights; V is RMS-normalized without a weight
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
+
+            cb(Qcur, "Qcur_normed", il);
+            cb(Kcur, "Kcur_normed", il);
+            cb(Vcur, "Vcur_normed", il);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur_pos", il);
+            cb(Kcur, "Kcur_pos", il);
+
+            cur = build_attn(inp_attn, model.layers[il].wo,
+                    NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+                    hparams.f_attention_scale, il);
+        } else {
+            // reuse KV cache of earlier layers: only Q is computed, K/V are passed as nullptr
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(Qcur, "Qcur_pos", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+        }
+        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, active_prediction);  // [n_embd, n_tokens]
+        cb(cur, "attn_gated", il);
+
+        ggml_tensor * attn_laurel = ggml_scale(ctx0, ggml_add(ctx0, cur, laurel_out),
+                                               1.0f / sqrtf(2.0f));  // [n_embd, n_tokens]
+        cb(attn_laurel, "attn_laurel", il);
+
+        cur = build_norm(attn_laurel, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // feed-forward network
+        {
+            ggml_tensor * up_proj   = build_lora_mm(model.layers[il].ffn_up, cur);
+            ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
+
+            if (il < n_layer_sparsity) {
+                // apply activation sparsity
+                gate_proj = gaussian_topk(gate_proj);
+            }
+            gate_proj = ggml_gelu(ctx0, gate_proj);
+
+            cur = ggml_mul(ctx0, up_proj, gate_proj);
+            cur = build_lora_mm(model.layers[il].ffn_down, cur);
+            cb(cur, "ffn_out", il);
+        }
+        // pass il (not -1) so this norm is tagged with its layer like every other per-layer norm here
+        cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_post_norm", il);
+
+        ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel);  // [n_embd, n_tokens]
+        cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
+
+        ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il);  // [n_embd, n_tokens, n_altup]
+
+        ggml_tensor * first_prediction;  // [n_embd, n_tokens]
+        {
+            first_prediction = view_2d_slice(corrected, i_altup_act);  // [n_embd, n_tokens]
+            first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
+            first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
+            first_prediction = ggml_gelu(ctx0, first_prediction);  // [n_embd_altup, n_tokens]
+            cb(first_prediction, "first_prediction_gated", il);
+            ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il);      // [n_embd_altup, n_tokens]
+            first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer);  // [n_embd_altup, n_tokens]
+            cb(first_prediction, "first_prediction_scaled", il);
+
+            first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction);  // [n_embd, n_tokens]
+            first_prediction =
+                build_norm(first_prediction, model.layers[il].per_layer_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(first_prediction, "first_prediction_out", il);
+        }
+        // equivalent to python code: corrected_predictions[1:] += first_prediction
+        {
+            ggml_tensor * slice_first = view_2d_slice(corrected, 0);
+            // view over slices 1..n_altup-1: the byte offset skips exactly one [n_embd, n_tokens] slice
+            ggml_tensor * slice_rest  = ggml_view_3d(
+                ctx0, corrected, n_embd, n_tokens, n_altup - 1, ggml_row_size(corrected->type, n_embd),
+                ggml_row_size(corrected->type, n_embd * n_tokens), n_embd * n_tokens * ggml_element_size(corrected));
+            ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction);  // [n_embd, n_tokens, n_altup - 1]
+            corrected         = ggml_concat(ctx0, slice_first, tmp, 2);        // [n_embd, n_tokens, n_altup]
+        }
+        cur = corrected;  // [n_embd, n_tokens, n_altup]
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;  // [n_embd, n_tokens, n_altup]
+
+    // cur now has multiple altup(s), we want to merge them back to 1 altup
+    {
+        ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act));  // [n_embd, n_tokens]
+        // do a view to skip the first slice (active altup)
+        ggml_tensor * alt_slice =
+            ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1, ggml_row_size(cur->type, n_embd),
+                         ggml_row_size(cur->type, n_embd * n_tokens), n_embd * n_tokens * ggml_element_size(cur));
+        ggml_tensor * altup_unembd =
+            ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice);  // shape: [n_embd, n_tokens, n_altup - 1]
+        ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
+        altup_unembd = ggml_div(ctx0, ggml_mul(ctx0, altup_unembd, target_magnitude), new_magnitude);
+        cb(altup_unembd, "altup_unembd", -1);
+
+        // equivalent to torch.mean(hidden_states, dim=0)
+        cur = view_2d_slice(cur, 0);  // [n_embd, n_tokens]
+        for (int i = 0; i < n_altup - 1; ++i) {
+            cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
+        }
+        cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup));  // [n_embd, n_tokens]
+        cb(cur, "unembd_merged", -1);
+    }
+    // cur now has shape: [n_embd, n_tokens]
+
+    // TODO: move this to right after the last KV layer
+    {
+        // skip computing output for unused tokens
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    {
+        // final logit soft-capping: cap * tanh(logits / cap)
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// Returns sqrt(sum_rows(x^2)): the square root of the row-wise sum of squared elements of x.
+ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
+    ggml_tensor * squared    = ggml_sqr(ctx0, x);
+    ggml_tensor * sum_of_sqr = ggml_sum_rows(ctx0, squared);
+    return ggml_sqrt(ctx0, sum_of_sqr);
+}
+
+// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
+//
+// x   : source tensor; the slice keeps its first two dims (ne[0] x ne[1])
+// idx : index along the 3rd dim, must satisfy 0 <= idx < x->ne[2]
+//
+// The byte offset is computed as idx * ne[0] * ne[1] * element_size, i.e. it assumes the slices
+// of x are densely packed one after another.
+ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
+    // a negative idx would wrap around to a huge unsigned byte offset, so reject it up front
+    GGML_ASSERT(idx >= 0 && idx < (int) x->ne[2]);
+    return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
+                        idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
+}
+
+// equivalent to get_per_layer_inputs() in python code
+// Looks up the per-layer token embeddings (model.tok_embd_per_layer) for the batch tokens and
+// scales them by sqrt(n_embd_altup). Registers the token id tensor as a graph input.
+// Only token input is supported; embedding input aborts.
+// output shape: [n_embd_altup, n_layer, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
+    auto tok_input = std::make_unique<llm_graph_input_embd>();
+
+    if (!ubatch.token) {
+        GGML_ABORT("TODO: support embd input");
+    }
+
+    // token ids are filled in later, when the graph inputs are set
+    tok_input->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+    ggml_set_input(tok_input->tokens);
+    res->t_tokens = tok_input->tokens;
+
+    ggml_tensor * per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, tok_input->tokens);
+    per_layer = ggml_reshape_3d(ctx0, per_layer, n_embd_altup, n_layer, n_tokens);
+    per_layer = ggml_scale(ctx0, per_layer, sqrtf((float) n_embd_altup));
+    cb(per_layer, "inp_per_layer_selected", -1);
+
+    res->add_input(std::move(tok_input));
+    return per_layer;
+}
+
+// equivalent to project_per_layer_inputs() in python code
+// Projects the input embeddings through model.per_layer_model_proj, RMS-normalizes the result,
+// adds it to the per-layer token embeddings and scales the sum by 1/sqrt(2).
+//   inputs_embeds : input embeddings
+//   inp_per_layer : per-layer embeddings, [n_embd_altup, n_layer, n_tokens]
+// The result is permuted so that n_layer becomes the last dim.
+// output shape: [n_embd_altup, n_tokens, n_layer]
+ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
+    const float proj_scale  = 1.0f / sqrtf((float) n_embd);
+    const float input_scale = 1.0f / sqrtf(2.0f);
+
+    ggml_tensor * projected = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
+    projected = ggml_scale(ctx0, projected, proj_scale);
+    projected = ggml_reshape_3d(ctx0, projected, n_embd_altup, n_layer, n_tokens);
+    // [n_embd_altup, n_layer, n_tokens]
+    projected = build_norm(projected, model.per_layer_proj_norm, NULL, LLM_NORM_RMS, -1);
+    cb(projected, "per_layer_proj", -1);
+
+    ggml_tensor * combined = ggml_add(ctx0, inp_per_layer, projected);
+    combined = ggml_scale(ctx0, combined, input_scale);
+    cb(combined, "inp_per_layer", -1);
+
+    // swap dims 1 and 2, then make the result contiguous: [n_embd_altup, n_tokens, n_layer]
+    ggml_tensor * swapped = ggml_permute(ctx0, combined, 0, 2, 1, 3);
+    return ggml_cont(ctx0, swapped);
+}
+
+// LAuReL branch: cur -> laurel_l -> laurel_r -> RMS norm, added back onto cur.
+// input cur shape: [n_embd, n_tokens] (the caller passes the attn-normed active prediction)
+// output shape:    [n_embd, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::laurel(ggml_tensor * cur, int il) {
+    const auto & layer = model.layers[il];
+
+    ggml_tensor * branch = build_lora_mm(layer.laurel_l, cur);
+    branch = build_lora_mm(layer.laurel_r, branch);
+    branch = build_norm(branch, layer.laurel_post_norm, NULL, LLM_NORM_RMS, il);
+
+    ggml_tensor * out = ggml_add(ctx0, branch, cur);
+    cb(out, "laurel_out", il);
+    return out;
+}
+
+// Activation sparsity: subtracts a per-row cutoff of mean + f_sparsity_std_mul * std from x
+// and clamps the result at zero with ReLU.
+// The std uses the (ne[0] - 1) denominator.
+// input x shape: 2D; the caller passes the ffn_gate projection
+//                (NOTE(review): so the first dim is presumably the FFN width, not n_embd - confirm)
+// output shape:  same as x
+ggml_tensor * llm_build_gemma3n_iswa::gaussian_topk(ggml_tensor * x) {
+    ggml_tensor * mean = ggml_mean(ctx0, x);
+
+    ggml_tensor * centered   = ggml_sub(ctx0, x, mean);
+    ggml_tensor * sq_dev_sum = ggml_sum_rows(ctx0, ggml_sqr(ctx0, centered));
+    ggml_tensor * variance   = ggml_scale(ctx0, sq_dev_sum, 1.0f / (float) (x->ne[0] - 1));
+    ggml_tensor * stddev     = ggml_sqrt(ctx0, variance);
+
+    ggml_tensor * cutoff = ggml_add(ctx0, mean, ggml_scale(ctx0, stddev, f_sparsity_std_mul));
+    ggml_tensor * shifted = ggml_sub(ctx0, x, cutoff);
+    return ggml_relu(ctx0, shifted);
+}
+
+//
+// altup functions
+//
+
+// equivalent to compute_router_modalities() in python code
+// RMS-normalizes x, scales it by 1/n_embd, multiplies by the layer's altup_router matrix
+// and squashes the result with tanh.
+// input x shape: [n_embd, n_tokens]
+// output shape:  [n_altup, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::altup_compute_router_modalities(ggml_tensor * x, int il) {
+    const auto & layer = model.layers[il];
+
+    ggml_tensor * normed = build_norm(x, layer.altup_router_norm, NULL, LLM_NORM_RMS, il);
+
+    // router_input_scale
+    ggml_tensor * scaled = ggml_scale(ctx0, normed, 1.0f / (float) n_embd);
+
+    ggml_tensor * routed = ggml_mul_mat(ctx0, layer.altup_router, scaled);
+    return ggml_tanh(ctx0, routed);  // [n_altup, n_tokens]
+}
+
+// AltUp predict step: derives per-token mixing coefficients from the active stream and uses them
+// to mix all streams; the mixed result is added onto the input streams.
+// input cur shape: [n_embd, n_tokens, n_altup]
+// output shape:    [n_embd, n_tokens, n_altup]
+ggml_tensor * llm_build_gemma3n_iswa::altup_predict(ggml_tensor * cur, int il) {
+    ggml_tensor * active = view_2d_slice(cur, i_altup_act);                  // [n_embd, n_tokens]
+    ggml_tensor * router = altup_compute_router_modalities(active, il);      // [n_altup, n_tokens]
+    cb(router, "modalities", il);
+
+    ggml_tensor * coefs = build_lora_mm(model.layers[il].altup_predict_coef, router);
+    cb(coefs, "all_coefs", il);
+    // the first dim holds n_altup^2 values per token; split it into an n_altup x n_altup matrix
+    coefs = ggml_reshape_3d(ctx0, coefs, n_altup, n_altup, n_tokens);
+
+    // bring the altup dim to the front: [n_altup, n_embd, n_tokens]
+    ggml_tensor * streams_first = ggml_permute(ctx0, cur, 1, 2, 0, 3);
+    streams_first = ggml_cont(ctx0, streams_first);
+
+    ggml_tensor * mixed = ggml_mul_mat(ctx0, streams_first, coefs);  // [n_altup, n_embd, n_tokens]
+
+    // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
+    mixed = ggml_cont(ctx0, ggml_permute(ctx0, mixed, 0, 2, 1, 3));
+
+    ggml_tensor * result = ggml_add(ctx0, mixed, cur);
+    cb(result, "predictions", il);
+
+    return result;
+}
+
+// AltUp correct step: computes the innovation (activated minus the active prediction), scales it
+// per stream by (altup_correct_coef * modalities + 1) and adds it onto every prediction stream.
+// input predictions shape: [n_embd, n_tokens, n_altup]
+// input activated shape:   [n_embd, n_tokens]
+// output shape:            [n_embd, n_tokens, n_altup]
+ggml_tensor * llm_build_gemma3n_iswa::altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
+    ggml_tensor * router = altup_compute_router_modalities(activated, il);  // [n_altup, n_tokens]
+    cb(router, "modalities", il);
+
+    ggml_tensor * predicted_active = view_2d_slice(predictions, i_altup_act);
+    ggml_tensor * delta = ggml_sub(ctx0, activated, predicted_active);  // [n_embd, n_tokens]
+    cb(delta, "innovation", il);
+
+    ggml_tensor * coefs = build_lora_mm(model.layers[il].altup_correct_coef, router);  // [n_altup, n_tokens]
+    coefs = ggml_scale_bias(ctx0, coefs, 1.0f, 1.0f);  // + 1.0
+    cb(coefs, "all_coefs", il);
+
+    // reshape to [1, n_tokens, n_altup] for the multiplication below
+    ggml_tensor * coefs_t = ggml_transpose(ctx0, coefs);               // [n_tokens, n_altup]
+    coefs_t = ggml_cont_3d(ctx0, coefs_t, 1, n_tokens, n_altup);       // [1, n_tokens, n_altup]
+
+    ggml_tensor * delta_rep = ggml_repeat_4d(ctx0, delta, n_embd, n_tokens, n_altup, 1);
+
+    ggml_tensor * result = ggml_mul(ctx0, delta_rep, coefs_t);  // [n_embd, n_tokens, n_altup]
+    result = ggml_add(ctx0, result, predictions);                // [n_embd, n_tokens, n_altup]
+    cb(result, "corrected", il);
+
+    return result;
+}
--- /dev/null
+#include "models.h"
+
+// Builds the compute graph for GLM-4 MoE.
+// Each processed layer is: RMS norm -> attention (optional Q/K/V bias, optional Q/K norm, RoPE)
+// -> residual -> RMS norm -> dense FFN (leading layers) or routed experts + shared expert -> residual.
+// The trailing hparams.nextn_predict_layers layers are not part of the forward pass.
+// The normed hidden state is stored in res->t_embd and the logits in res->t_logits.
+llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, used by RoPE
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // The final NextN layer(s) are loaded with the model but skipped here
+    const int n_main_layers = n_layer - hparams.nextn_predict_layers;
+
+    for (int il = 0; il < n_main_layers; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = inpL;
+
+        // Pre-attention norm
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // linear projection of the normed input, plus bias when the layer has one
+            const auto project = [&](ggml_tensor * w, ggml_tensor * b) {
+                ggml_tensor * t = build_lora_mm(w, cur);
+                if (b) {
+                    t = ggml_add(ctx0, t, b);
+                }
+                return t;
+            };
+
+            ggml_tensor * Qcur = project(layer.wq, layer.bq);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = project(layer.wk, layer.bk);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = project(layer.wv, layer.bv);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // Q/K norm is optional (GLM-4.5 355B variant)
+            if (layer.attn_q_norm) {
+                Qcur = build_norm(Qcur, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+            }
+            if (layer.attn_k_norm) {
+                Kcur = build_norm(Kcur, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+            }
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last processed layer keep only the rows selected by inp_out_ids
+        const bool is_last = (il == n_main_layers - 1);
+        if (is_last && inp_out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
+            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // Post-attention norm
+        cur = build_norm(ffn_inp, layer.attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "post_attn_norm", il);
+
+        // The first n_layer_dense_lead layers use a dense FFN, the rest use MoE
+        const bool is_dense = static_cast<uint32_t>(il) < hparams.n_layer_dense_lead;
+        if (is_dense) {
+            cur = build_ffn(cur,
+                    layer.ffn_up,   NULL, NULL,
+                    layer.ffn_gate, NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // routed experts
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    layer.ffn_gate_inp,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    layer.ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // shared expert, fed the same normed input as the routed experts
+            ggml_tensor * shexp_out = build_ffn(cur,
+                    layer.ffn_up_shexp,   NULL, NULL,
+                    layer.ffn_gate_shexp, NULL, NULL,
+                    layer.ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(shexp_out, "ffn_shexp_out", il);
+
+            // routed output + shared output
+            cur = ggml_add(ctx0, moe_out, shexp_out);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    ggml_tensor * out = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+
+// Builds the compute graph for GLM-4.
+// Each layer is: RMS norm -> attention (separate Q/K/V weights or a fused wqkv, RoPE)
+// -> post-attention RMS norm -> residual -> RMS norm -> SwiGLU MLP -> post-MLP RMS norm -> residual.
+// The control vector (build_cvec) is applied to every layer output, as in the other builders.
+// The normed hidden state is stored in res->t_embd and the logits in res->t_logits.
+llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // Pre-attention norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
+
+            if (model.layers[il].wqkv == nullptr) {
+                // separate Q/K/V projections, each with an optional bias
+                Qcur = build_lora_mm(model.layers[il].wq, cur);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                }
+                Kcur = build_lora_mm(model.layers[il].wk, cur);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                }
+                Vcur = build_lora_mm(model.layers[il].wv, cur);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                }
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            } else {
+                // fused QKV projection: Q, K and V are views into one row at increasing byte offsets
+                cur = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+                if (model.layers[il].bqkv) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
+                                    0 * sizeof(float) * (n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+                                    cur->nb[1], 1 * sizeof(float) * (n_embd));
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+                                    cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+            }
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+        }
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        // Post-attention norm
+        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "post_attn_norm", il);
+
+        // Add the input (residual connection after post-attention norm)
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // FF
+        {
+            // Pre-MLP norm
+            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // MLP
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            // Post-MLP norm
+            cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "post_mlp_norm", il);
+        }
+        // Add residual connection after post-MLP norm
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        // apply the control vector to the layer output, consistent with the other model builders
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    // Final norm
+    cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // Output projection
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the compute graph for GPT-2.
+// Position embeddings (model.pos_embd) are added to the token embeddings up front.
+// Each layer is: LayerNorm -> fused QKV attention (with biases) -> residual
+// -> LayerNorm -> GELU MLP -> residual -> control vector.
+// The normed hidden state is stored in res->t_embd and the logits in res->t_logits.
+llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // byte layout of Q, K and V inside one row of the fused QKV result
+    const size_t head_stride = n_embd_head*sizeof(float);
+    const size_t q_offset    = 0*sizeof(float)*(n_embd);
+    const size_t k_offset    = 1*sizeof(float)*(n_embd);
+    const size_t v_offset    = 1*sizeof(float)*(n_embd + n_embd_gqa);
+    const float  kq_scale    = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, used to look up the position embeddings
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+    cb(pos, "pos_embd", -1);
+
+    inpL = ggml_add(ctx0, inpL, pos);
+    cb(inpL, "inpL", -1);
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur);
+            cb(qkv, "wqkv", il);
+
+            qkv = ggml_add(ctx0, qkv, layer.bqkv);
+            cb(qkv, "bqkv", il);
+
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head,    n_tokens, head_stride, qkv->nb[1], q_offset);
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, head_stride, qkv->nb[1], k_offset);
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, head_stride, qkv->nb[1], v_offset);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn, layer.wo, layer.bo, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        const bool is_last = (il == n_layer - 1);
+        if (is_last && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+
+        // residual around attention
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // FF
+        cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   layer.ffn_up_b,   NULL,
+                NULL,           NULL,             NULL,
+                layer.ffn_down, layer.ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_GELU, LLM_FFN_SEQ, il);
+        cb(cur, "ffn_out", il);
+
+        // residual around the MLP
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    ggml_tensor * out = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+// Builds the compute graph for GPT-NeoX.
+// Each layer runs LayerNorm -> fused QKV attention (with biases, RoPE on Q and K), then a GELU MLP
+// combined either in parallel (hparams.use_par_res) or sequentially with the attention output.
+// The normed hidden state is stored in res->t_embd and the logits in res->t_logits.
+llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // byte layout of Q, K and V inside one row of the fused QKV result
+    const size_t head_stride = n_embd_head*sizeof(float);
+    const size_t q_offset    = 0*sizeof(float)*(n_embd);
+    const size_t k_offset    = 1*sizeof(float)*(n_embd);
+    const size_t v_offset    = 1*sizeof(float)*(n_embd + n_embd_gqa);
+    const float  kq_scale    = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, used by RoPE
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur);
+            cb(qkv, "wqkv", il);
+
+            qkv = ggml_add(ctx0, qkv, layer.bqkv);
+            cb(qkv, "bqkv", il);
+
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head,    n_tokens, head_stride, qkv->nb[1], q_offset);
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, head_stride, qkv->nb[1], k_offset);
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, head_stride, qkv->nb[1], v_offset);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn, layer.wo, layer.bo, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        const bool is_last = (il == n_layer - 1);
+        if (is_last && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+
+        // LayerNorm followed by the GELU MLP; shared by both residual layouts below
+        const auto norm_and_ffn = [&](ggml_tensor * x) {
+            ggml_tensor * y = build_norm(x, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+            cb(y, "ffn_norm", il);
+
+            y = build_ffn(y,
+                    layer.ffn_up,   layer.ffn_up_b,   NULL,
+                    NULL,           NULL,             NULL,
+                    layer.ffn_down, layer.ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(y, "ffn_out", il);
+            return y;
+        };
+
+        // ffn
+        if (hparams.use_par_res) {
+            // parallel residual:
+            //   x = x + attn(ln1(x)) + ffn(ln2(x))
+            ggml_tensor * attn_out = cur;
+
+            cur = norm_and_ffn(inpL);
+
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, attn_out);
+        } else {
+            // sequential residual:
+            //   x = x + attn(ln1(x))
+            //   x = x + ffn(ln2(x))
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = norm_and_ffn(ffn_inp);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+        }
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    ggml_tensor * out = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+// Builds the compute graph for Granite hybrid models.
+// Each layer is RMS norm -> either a Mamba2 layer (hparams.is_recurrent(il)) or an attention layer,
+// followed by build_layer_ffn. Logits are divided by hparams.f_logit_scale when it is non-zero.
+// The normed hidden state is stored in res->t_embd and the logits in res->t_logits.
+llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context_mamba(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    auto * inp_hybrid = build_inp_mem_hybrid();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // positions are only built when rope is enabled; they stay null otherwise
+    ggml_tensor * inp_pos = hparams.rope_finetuned ? build_inp_pos() : nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * residual = inpL;
+
+        // norm
+        ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        if (hparams.is_recurrent(il)) {
+            // ssm layer
+            cur = build_mamba2_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
+        } else {
+            // attention layer
+            cur = build_attention_layer(cur, inp_pos, inp_hybrid->get_attn(), model, n_embd_head, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        const bool is_last = (il == n_layer - 1);
+        if (is_last && inp_out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
+            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+        }
+
+        // ffn; its result is the input for the next layer
+        inpL = build_layer_ffn(cur, residual, model, il);
+    }
+
+    ggml_tensor * out = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+
+    // For Granite architectures - scale logits
+    if (hparams.f_logit_scale != 0.0f) {
+        out = ggml_scale(ctx0, out, 1.0f / hparams.f_logit_scale);
+    }
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
+
+// Builds one attention layer: Q/K/V projections (with optional biases), optional RoPE on Q and K,
+// then build_attn with the layer's output projection.
+//   cur         : normed layer input
+//   inp_pos     : token positions; only read when hparams.rope_finetuned is set
+//   inp_attn    : attention input (KV cache) handle
+//   n_embd_head : per-head embedding size used to reshape Q/K/V
+//   il          : layer index
+// The attention scale is hparams.f_attention_scale, or 1/sqrt(n_embd_head) when that is 0.
+ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *             cur,
+                                                              ggml_tensor *             inp_pos,
+                                                              llm_graph_input_attn_kv * inp_attn,
+                                                              const llama_model &       model,
+                                                              const int64_t             n_embd_head,
+                                                              const int                 il) {
+    const auto & layer = model.layers[il];
+
+    // compute Q, K and V; each is tagged after the matmul and again after the bias add
+    ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
+    cb(Qcur, "Qcur", il);
+    if (layer.bq) {
+        Qcur = ggml_add(ctx0, Qcur, layer.bq);
+        cb(Qcur, "Qcur", il);
+    }
+
+    ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+    cb(Kcur, "Kcur", il);
+    if (layer.bk) {
+        Kcur = ggml_add(ctx0, Kcur, layer.bk);
+        cb(Kcur, "Kcur", il);
+    }
+
+    ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+    cb(Vcur, "Vcur", il);
+    if (layer.bv) {
+        Vcur = ggml_add(ctx0, Vcur, layer.bv);
+        cb(Vcur, "Vcur", il);
+    }
+
+    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il),    n_tokens);
+    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+    // RoPE is optional for this architecture
+    if (hparams.rope_finetuned) {
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                             ext_factor, attn_factor, beta_fast, beta_slow);
+        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                             ext_factor, attn_factor, beta_fast, beta_slow);
+    }
+
+    cb(Qcur, "Qcur", il);
+    cb(Kcur, "Kcur", il);
+    cb(Vcur, "Vcur", il);
+
+    // fall back to the standard 1/sqrt(head_dim) scale when no explicit attention scale is set
+    float kq_scale = hparams.f_attention_scale;
+    if (hparams.f_attention_scale == 0.0f) {
+        kq_scale = 1.0f / sqrtf(float(n_embd_head));
+    }
+
+    ggml_tensor * attn_out = build_attn(inp_attn,
+            layer.wo, layer.bo,
+            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+    cb(attn_out, "attn_out", il);
+    return attn_out;
+}
+
+ggml_tensor * llm_build_granite_hybrid::build_layer_ffn(ggml_tensor *       cur,
+                                                        ggml_tensor *       inpSA,
+                                                        const llama_model & model,
+                                                        const int           il) {
+    // Second half of a layer: adds the (optionally scaled) mixer output to the residual
+    // `inpSA`, runs the dense or MoE feed-forward on the RMS-normed sum, adds the
+    // (optionally scaled) FFN output back, and applies build_cvec.
+    // Returns the layer output ("l_out").
+    const auto & layer          = model.layers[il];
+    const bool   scale_residual = hparams.f_residual_scale != 0.0f;
+
+    if (scale_residual) {
+        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+    }
+    ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+    cb(ffn_inp, "ffn_inp", il);
+
+    // both the dense and the MoE path start from the same normed input
+    cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+    cb(cur, "ffn_norm", il);
+
+    if (layer.ffn_gate_inp != nullptr) {
+        // MoE branch
+        ggml_tensor * routed =
+            build_moe_ffn(cur,
+                    layer.ffn_gate_inp,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+        cb(routed, "ffn_moe_out", il);
+
+        if (hparams.n_ff_shexp > 0) {
+            // Granite MoE Shared: add the shared-expert FFN to the routed output
+            ggml_tensor * shared =
+                build_ffn(cur,
+                        layer.ffn_up_shexp,   NULL, NULL,
+                        layer.ffn_gate_shexp, NULL, NULL,
+                        layer.ffn_down_shexp, NULL, NULL,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(shared, "ffn_shexp", il);
+
+            cur = ggml_add(ctx0, routed, shared);
+            cb(cur, "ffn_out", il);
+        } else {
+            cur = routed;
+        }
+    } else {
+        // dense feed-forward network (non-MoE)
+        cur = build_ffn(cur,
+                layer.ffn_up,   layer.ffn_up_b,   NULL,
+                layer.ffn_gate, layer.ffn_gate_b, NULL,
+                layer.ffn_down, layer.ffn_down_b, NULL,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    }
+
+    if (scale_residual) {
+        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+    }
+    cur = ggml_add(ctx0, cur, ffn_inp);
+    cb(cur, "ffn_out", il);
+
+    cur = build_cvec(cur, il);
+    cb(cur, "l_out", il);
+
+    return cur;
+}
--- /dev/null
+#include "models.h"
+
+
+llm_build_granite::llm_build_granite(
+    const llama_model & model,
+    const llm_graph_params & params)
+    : llm_graph_context(params) {
+    // Builds the Granite graph: per layer an RMS norm, build_attention_layer and
+    // build_layer_ffn; afterwards the output norm, the lm_head projection and the
+    // Granite logit scaling.
+
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - built only if rope enabled
+    ggml_tensor * inp_pos = nullptr;
+    if (hparams.rope_finetuned) {
+        inp_pos = build_inp_pos();
+    }
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // residual stream entering this layer
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        cur = build_attention_layer(
+            cur, inp_pos, inp_attn,
+            model, n_embd_head, il);
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when present)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        // ffn
+        cur = build_layer_ffn(cur, inpSA, model, il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    // For Granite architectures - scale logits.
+    // A zero scale is treated as "no scaling" (same guard as the Granite hybrid builder);
+    // dividing by it would turn the multiplier into inf and corrupt every logit.
+    if (hparams.f_logit_scale) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+    }
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_granite::build_attention_layer(
+    ggml_tensor *             cur,
+    ggml_tensor *             inp_pos,
+    llm_graph_input_attn_kv * inp_attn,
+    const llama_model &       model,
+    const int64_t             n_embd_head,
+    const int                 il) {
+    // Self-attention for layer `il`. Projects `cur` to Q, K and V (adding the bias tensors
+    // that exist), rotates Q and K when RoPE is enabled, and hands everything to build_attn
+    // together with the output projection. Returns the attention output.
+    const auto & lyr = model.layers[il];
+
+    ggml_tensor * query = build_lora_mm(lyr.wq, cur);
+    cb(query, "Qcur", il);
+    if (lyr.bq) {
+        query = ggml_add(ctx0, query, lyr.bq);
+        cb(query, "Qcur", il);
+    }
+
+    ggml_tensor * key = build_lora_mm(lyr.wk, cur);
+    cb(key, "Kcur", il);
+    if (lyr.bk) {
+        key = ggml_add(ctx0, key, lyr.bk);
+        cb(key, "Kcur", il);
+    }
+
+    ggml_tensor * value = build_lora_mm(lyr.wv, cur);
+    cb(value, "Vcur", il);
+    if (lyr.bv) {
+        value = ggml_add(ctx0, value, lyr.bv);
+        cb(value, "Vcur", il);
+    }
+
+    // {n_embd_head, heads, n_tokens}
+    query = ggml_reshape_3d(ctx0, query, n_embd_head, hparams.n_head(il),    n_tokens);
+    key   = ggml_reshape_3d(ctx0, key,   n_embd_head, hparams.n_head_kv(il), n_tokens);
+    value = ggml_reshape_3d(ctx0, value, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+    // RoPE only when enabled for the model
+    if (hparams.rope_finetuned) {
+        ggml_tensor * freq_factors = model.get_rope_factors(cparams, il);
+
+        query = ggml_rope_ext(ctx0, query, inp_pos, freq_factors,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+        key = ggml_rope_ext(ctx0, key, inp_pos, freq_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+    }
+
+    cb(query, "Qcur", il);
+    cb(key,   "Kcur", il);
+    cb(value, "Vcur", il);
+
+    // fall back to 1/sqrt(head width) when no explicit attention scale is configured
+    float kq_scale = hparams.f_attention_scale;
+    if (kq_scale == 0.0f) {
+        kq_scale = 1.0f/sqrtf(float(n_embd_head));
+    }
+
+    cur = build_attn(inp_attn,
+            lyr.wo, lyr.bo,
+            query, key, value, nullptr, nullptr, nullptr, kq_scale, il);
+    cb(cur, "attn_out", il);
+    return cur;
+}
+
+ggml_tensor * llm_build_granite::build_layer_ffn(
+    ggml_tensor *       cur,
+    ggml_tensor *       inpSA,
+    const llama_model & model,
+    const int           il) {
+    // Residual add around the attention output, then the feed-forward stage (dense, or MoE
+    // with an optional shared expert), then the second residual add and build_cvec.
+    // Both residual branches are multiplied by hparams.f_residual_scale when it is non-zero.
+    const auto & lyr         = model.layers[il];
+    const bool   apply_scale = hparams.f_residual_scale != 0.0f;
+
+    if (apply_scale) {
+        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+    }
+    ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+    cb(ffn_inp, "ffn_inp", il);
+
+    // the normed input is shared by the dense and MoE branches
+    cur = build_norm(ffn_inp,
+            lyr.ffn_norm, NULL,
+            LLM_NORM_RMS, il);
+    cb(cur, "ffn_norm", il);
+
+    const bool is_moe = lyr.ffn_gate_inp != nullptr;
+    if (!is_moe) {
+        // feed-forward network (non-MoE)
+        cur = build_ffn(cur,
+                lyr.ffn_up,   lyr.ffn_up_b,   NULL,
+                lyr.ffn_gate, lyr.ffn_gate_b, NULL,
+                lyr.ffn_down, lyr.ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    } else {
+        // MoE branch
+        ggml_tensor * expert_out = build_moe_ffn(cur,
+                lyr.ffn_gate_inp,
+                lyr.ffn_up_exps,
+                lyr.ffn_gate_exps,
+                lyr.ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, true,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(expert_out, "ffn_moe_out", il);
+
+        if (hparams.n_ff_shexp > 0) {
+            // For Granite MoE Shared: sum the shared-expert FFN with the routed experts
+            ggml_tensor * shexp_out = build_ffn(cur,
+                    lyr.ffn_up_shexp,   NULL, NULL,
+                    lyr.ffn_gate_shexp, NULL, NULL,
+                    lyr.ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(shexp_out, "ffn_shexp", il);
+
+            cur = ggml_add(ctx0, expert_out, shexp_out);
+            cb(cur, "ffn_out", il);
+        } else {
+            cur = expert_out;
+        }
+    }
+
+    if (apply_scale) {
+        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+    }
+    cur = ggml_add(ctx0, cur, ffn_inp);
+    cb(cur, "ffn_out", il);
+
+    cur = build_cvec(cur, il);
+    cb(cur, "l_out", il);
+
+    return cur;
+}
--- /dev/null
+#include "models.h"
+
+llm_graph_context_mamba::llm_graph_context_mamba(const llm_graph_params & params)
+    : llm_graph_context(params) {
+    // no state of its own: everything is forwarded to the base graph context
+}
+
+ggml_tensor * llm_graph_context_mamba::build_mamba_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) { // Builds the graph of one Mamba mixer for layer il and returns its output as a 2D tensor
+ const auto * mctx_cur = inp->mctx;
+ // kv_head scales the byte offsets at which the updated conv/ssm states are copied back below
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t dt_rank = hparams.ssm_dt_rank;
+ const int64_t n_head = d_inner; // the scan below sees x as d_inner heads ...
+ const int64_t head_dim = 1; // ... of width 1
+ const int64_t n_seqs = ubatch.n_seqs;
+ // Some variants of the Mamba arch (e.g. FalconMamba) apply RMS norm to the dt, B and C projections
+ const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ // per-layer state tensors taken from the memory context
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+ // conv states obtained through build_rs, viewed as {d_conv - 1, d_inner, n_seqs}
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+ // split the above in two
+ // => {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); // first d_inner entries of each row
+ ggml_tensor * z =
+ ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner * ggml_element_size(xz)); // view starting d_inner elements into each row
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
+ n_seq_tokens * (conv_x->nb[0])); // skip the first n_seq_tokens columns
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all, (d_conv - 1) * (d_inner) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner) *ggml_element_size(conv_states_all)))); // destination starts kv_head states into the cache
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
+ x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
+
+ // bias
+ x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
+
+ x = ggml_silu(ctx0, x);
+ }
+
+ // ssm
+ {
+ // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+ ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
+ // split x_db into dt (first dt_rank entries), then B and C (d_state entries each)
+ ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
+ ggml_tensor * B =
+ ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+ x_db->nb[2], ggml_element_size(x_db) * dt_rank);
+ ggml_tensor * C =
+ ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+ x_db->nb[2], ggml_element_size(x_db) * (dt_rank + d_state));
+
+ // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
+ if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) { // enabled by the hparam flag or by all three norm tensors being present
+ dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
+ B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
+ C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
+ }
+
+ // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+ dt = build_lora_mm(layer.ssm_dt, dt);
+ dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
+
+ cur = x; // keep the 3D conv output for the ssm_d term added after the scan
+ x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
+
+ ggml_tensor * A = layer.ssm_a;
+
+ // use the states and the indices provided by build_rs
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding to make unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // Custom operator to optimize the parallel associative scan
+ // as described in the Annex D of the Mamba paper.
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states (read from y_ssm at the byte offset right after the x-sized output)
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, x->nb[3] * x->ne[3]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+ kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+ // the leading part of y_ssm, viewed as {d_inner, n_seq_tokens, n_seqs} using x's strides
+ ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d)); // add the conv output multiplied by ssm_d
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); // gate with z (made contiguous first)
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(layer.ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) const { // Builds the graph of one Mamba2 mixer for layer il and returns its output as a 2D tensor
+ const auto * mctx_cur = inp->mctx;
+ // kv_head scales the byte offsets at which the updated conv/ssm states are copied back below
+ const auto kv_head = mctx_cur->get_head();
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_head = hparams.ssm_dt_rank; // the head count is read from the ssm_dt_rank hparam
+ const int64_t head_dim = d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ // per-layer state tensors taken from the memory context
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+ // conv states obtained through build_rs, viewed as {d_conv - 1, d_inner + 2*n_group*d_state, n_seqs}
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
+
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+ // split the above in three: z at offset 0, xBC after d_inner elements, dt after 2*d_inner + 2*n_group*d_state elements
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
+ zxBCdt->nb[1], zxBCdt->nb[2], 0);
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2 * n_group * d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1],
+ zxBCdt->nb[2], d_inner * ggml_element_size(zxBCdt));
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2],
+ (2 * d_inner + 2 * n_group * d_state) * ggml_element_size(zxBCdt));
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs,
+ conv_x->nb[1], conv_x->nb[2], n_seq_tokens * (conv_x->nb[0])); // skip the first n_seq_tokens columns
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all,
+ (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
+ ggml_element_size(conv_states_all)))); // destination starts kv_head states into the cache
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+ // bias
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+ xBC = ggml_silu(ctx0, xBC);
+ }
+
+ // ssm
+ {
+ // These correspond to V K Q in SSM/attention duality
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], 0); // first d_inner entries of each row, split into heads
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], d_inner * ggml_element_size(xBC)); // next n_group*d_state entries
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], (d_inner + n_group * d_state) * ggml_element_size(xBC)); // final n_group*d_state entries
+
+ // {n_head, n_seq_tokens, n_seqs}
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b); // dt is a strided view, so it is made contiguous before the bias add
+
+ ggml_tensor * A = model.layers[il].ssm_a;
+
+ // use the states and the indices provided by build_rs
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding to make unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // TODO: use semistructured matrices to implement state-space duality
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states (read from y_ssm at the byte offset right after an x-sized output)
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, ggml_nelements(x) * x->nb[0]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+ kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+ // the leading part of y_ssm, viewed as {head_dim, n_head, n_seq_tokens, n_seqs} with strides derived from x->nb[1]
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head * x->nb[1],
+ n_seq_tokens * n_head * x->nb[1], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); // add x multiplied by ssm_d
+ cb(y, "mamba2_y_add_d", il);
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); // gate with z (made contiguous first)
+
+ // grouped RMS norm (only when the layer has an ssm_norm tensor)
+ if (model.layers[il].ssm_norm) {
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+ }
+ // merge heads/groups back into a single d_inner dimension
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ cb(cur, "mamba_out", il);
+
+ return cur;
+}
--- /dev/null
+#include "models.h"
+
+llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Builds the Grok graph. Per layer: RMS norm, RoPE self-attention (scale 1.0), a norm on
+    // the attention output, residual add, then a GELU MoE feed-forward that is combined with
+    // a dense FFN when the layer has one, a post-FFN norm and the second residual add.
+    // The logits are multiplied by f_logit_scale and optionally soft-capped.
+    const int64_t head_width = hparams.n_embd_head_v;
+
+    GGML_ASSERT(head_width == hparams.n_embd_head_k);
+    GGML_ASSERT(head_width == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_rows = build_inp_out_ids();
+
+    // factor applied to the sum of the dense FFN and MoE outputs
+    const float half_sqrt2 = std::sqrt(2) / 2;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto &  layer    = model.layers[il];
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            q = ggml_reshape_3d(ctx0, q, head_width, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_width, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_width, n_head_kv, n_tokens);
+
+            // RoPE on Q and K, without frequency factors
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, 1.0f, il);
+        }
+
+        // on the last layer, keep only the rows selected by out_rows (when present)
+        if (il == n_layer - 1 && out_rows) {
+            cur      = ggml_get_rows(ctx0, cur, out_rows);
+            residual = ggml_get_rows(ctx0, residual, out_rows);
+        }
+
+        cur = build_norm(cur, layer.attn_out_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_out_norm", il);
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // MoE branch
+        ggml_tensor * routed = build_moe_ffn(cur,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_GELU, true,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(routed, "ffn_moe_out", il);
+
+        if (!layer.ffn_up) {
+            cur = routed;
+        } else {
+            // the layer also has a dense FFN: sum both outputs and scale by sqrt(2)/2
+            ggml_tensor * dense = build_ffn(cur,
+                    layer.ffn_up,   NULL, NULL,
+                    layer.ffn_gate, NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(dense, "ffn_out", il);
+
+            cur = ggml_scale(ctx0, ggml_add(ctx0, dense, routed), half_sqrt2);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = build_norm(cur, layer.ffn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
+
+    // final logit soft-capping: cap * tanh(logits / cap), skipped when the cap is zero
+    const float softcap = hparams.f_final_logit_softcapping;
+    if (softcap != 0.0f) {
+        cur = ggml_scale(ctx0, cur, 1.0f / softcap);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, softcap);
+    }
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+
+llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    // Builds the GroveMoE graph. Per layer: RMS norm, self-attention with RMS-normed and
+    // RoPE-rotated Q/K, residual add, then two MoE passes that share one set of router
+    // logits: the regular experts first, followed by the "chunk" experts, which take the
+    // first pass's output as input and whose result is added with expert_group_scale.
+    const int64_t head_width     = hparams.n_embd_head_v;
+    const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+    GGML_ASSERT(head_width == hparams.n_embd_head_k);
+    GGML_ASSERT(head_width == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_rows = build_inp_out_ids();
+
+    // the chunk-expert pass never uses more experts than it has
+    const auto n_chunk_used = n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto &  layer    = model.layers[il];
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+
+            q = ggml_reshape_3d(ctx0, q, head_width, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_width, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_width, n_head_kv, n_tokens);
+
+            // Q: norm, then RoPE
+            q = build_norm(q, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(q, "Qcur_normed", il);
+
+            q = ggml_rope_ext(ctx0, q, positions, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // K: norm, then RoPE
+            k = build_norm(k, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(k, "Kcur_normed", il);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(head_width)), il);
+        }
+
+        // on the last layer, keep only the rows selected by out_rows (when present)
+        if (il == n_layer - 1 && out_rows) {
+            cur      = ggml_get_rows(ctx0, cur, out_rows);
+            residual = ggml_get_rows(ctx0, residual, out_rows);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // router logits, computed once and passed to both MoE passes
+        ggml_tensor * router_logits = build_lora_mm(layer.ffn_gate_inp, cur); // [n_expert, n_tokens]
+        cb(router_logits, "ffn_moe_logits", il);
+
+        ggml_tensor * expert_out =
+            build_moe_ffn(cur,
+                    nullptr,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il,
+                    router_logits);
+        cb(expert_out, "ffn_moe_out", il);
+        cur = expert_out;
+
+        // TODO: Only do the expert selection and weights once
+        ggml_tensor * chunk_out =
+            build_moe_ffn(cur,
+                    nullptr,
+                    layer.ffn_up_chexps,
+                    layer.ffn_gate_chexps,
+                    layer.ffn_down_chexps,
+                    nullptr,
+                    n_chunk_expert, n_chunk_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il,
+                    router_logits);
+        cb(chunk_out, "ffn_adj_moe_out", il);
+
+        cur = ggml_add(ctx0, cur, ggml_scale(ctx0, chunk_out, hparams.expert_group_scale));
+        cb(cur, "ffn_final_moe_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Builds the Hunyuan dense graph. Per layer: RMS norm, self-attention in which Q and K
+    // are RoPE-rotated first and RMS-normed afterwards, residual add, then a SiLU gated FFN
+    // with its own residual add. Ends with the output norm and the lm_head projection.
+    const int64_t head_width = hparams.n_embd_head_v;
+
+    GGML_ASSERT(head_width == hparams.n_embd_head_k);
+    GGML_ASSERT(head_width == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    const float attn_scale = 1.0f / sqrtf(float(head_width));
+
+    ggml_tensor * out_rows = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto &  layer    = model.layers[il];
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * freq_factors = model.get_rope_factors(cparams, il);
+
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            q = ggml_reshape_3d(ctx0, q, head_width, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_width, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_width, n_head_kv, n_tokens);
+
+            q = ggml_rope_ext(ctx0, q, positions, freq_factors,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // note: the callbacks run after the Q rotation but before the K rotation
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            k = ggml_rope_ext(ctx0, k, positions, freq_factors,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // per-head RMS norm on K, then on Q, applied after RoPE
+            k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            cb(k, "Kcur_norm", il);
+
+            q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(q, "Qcur_norm", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // on the last layer, keep only the rows selected by out_rows (when present)
+        if (il == n_layer - 1 && out_rows) {
+            cur      = ggml_get_rows(ctx0, cur, out_rows);
+            residual = ggml_get_rows(ctx0, residual, out_rows);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // feed-forward network (non-MoE)
+        ggml_tensor * mlp_out = build_ffn(cur,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(mlp_out, "ffn_out", il);
+
+        cur = ggml_add(ctx0, mlp_out, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the Hunyuan MoE architecture.
+//
+// Per layer: RMS norm -> Q/K/V projections (biases added when present) -> RoPE on Q and K ->
+// RMS norm on K and Q -> attention -> residual add -> RMS norm -> shared-expert FFN summed
+// with the routed MoE FFN -> residual add -> build_cvec.
+// The final RMS-normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE below
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    ggml_tensor * cur = nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = layer_in;
+
+        // pre-attention norm
+        cur = build_norm(layer_in, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq != nullptr) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk != nullptr) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv != nullptr) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            q = ggml_rope_ext(ctx0, q, positions, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // note: at this point K has not been rotated yet
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            k = ggml_rope_ext(ctx0, k, positions, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // Q/K norms are applied after RoPE in this architecture
+            k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            cb(k, "Kcur_norm", il);
+
+            q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(q, "Qcur_norm", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // dense FFN built from the *_shexp weights
+        ggml_tensor * shexp_out = build_ffn(cur,
+                layer.ffn_up_shexp,   nullptr, nullptr,
+                layer.ffn_gate_shexp, nullptr, nullptr,
+                layer.ffn_down_shexp, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(shexp_out, "ffn_mlp", il);
+
+        // routed experts
+        ggml_tensor * moe_out = build_moe_ffn(cur,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU,
+                true, // norm_topk_prob
+                false,
+                0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_sum = ggml_add(ctx0, moe_out, shexp_out);
+        cb(ffn_sum, "ffn_out", il);
+
+        cur = ggml_add(ctx0, ffn_sum, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        layer_in = cur;
+    }
+
+    cur = build_norm(layer_in, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for the InternLM2 architecture.
+//
+// Per layer: RMS norm -> Q/K/V projections (biases added when present) -> RoPE on Q and K
+// (no frequency factors) -> attention -> residual add -> RMS norm -> SiLU-gated parallel FFN ->
+// residual add -> build_cvec.
+// The final RMS-normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE below
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    ggml_tensor * cur = nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = layer_in;
+
+        // pre-attention norm
+        cur = build_norm(layer_in, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq != nullptr) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk != nullptr) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv != nullptr) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_inp, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   nullptr, nullptr,
+                layer.ffn_gate, nullptr, nullptr,
+                layer.ffn_down, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        layer_in = cur;
+    }
+
+    cur = build_norm(layer_in, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the Jais architecture.
+//
+// Per layer: LayerNorm (weight + bias) -> fused QKV projection plus bias -> Q/K/V taken as
+// views into the fused result -> attention scaled by 1/n_embd_head (no RoPE is applied here) ->
+// residual add -> LayerNorm -> SiLU-gated parallel FFN with biases -> residual add.
+// The final normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    ggml_tensor * cur = nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        cur = build_norm(inpL, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            cur = build_lora_mm(layer.wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            cur = ggml_add(ctx0, cur, layer.bqkv);
+            cb(cur, "bqkv", il);
+
+            // byte stride between consecutive heads and byte offsets of the K and V parts
+            // inside one fused row (Q starts at offset 0)
+            const size_t head_stride = n_embd_head*sizeof(float);
+            const size_t k_offset    = cur->nb[0]*(n_embd);
+            const size_t v_offset    = cur->nb[0]*(n_embd + n_embd_gqa);
+
+            ggml_tensor * q = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, head_stride, cur->nb[1], 0);
+            ggml_tensor * k = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_stride, cur->nb[1], k_offset);
+            ggml_tensor * v = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_stride, cur->nb[1], v_offset);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            // scale is 1/n_embd_head, not 1/sqrt(n_embd_head)
+            const float kq_scale = 1.0f/float(n_embd_head);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur  = ggml_get_rows(ctx0, cur,  out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, out_ids);
+        }
+
+        // add the input
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // FF
+        cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   layer.ffn_up_b,   nullptr,
+                layer.ffn_gate, layer.ffn_gate_b, nullptr,
+                layer.ffn_down, layer.ffn_down_b, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // input for next layer
+        inpL = ggml_add(ctx0, cur, ffn_inp);
+        cb(inpL, "l_out", il);
+    }
+
+    cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for the Jamba architecture.
+//
+// Each layer is either a mamba layer (when hparams.n_head_kv(il) == 0) or an attention layer
+// without RoPE, followed by a residual add, an RMS norm and either a dense SiLU-gated FFN
+// (when the layer has no ffn_gate_inp) or a routed MoE FFN, then a second residual add and
+// build_cvec.
+// The final RMS-normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // {n_embd, n_tokens}
+    inpL = build_inp_embd(model.tok_embd);
+
+    auto * inp_hybrid = build_inp_mem_hybrid();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // per-layer KV head count; shadows the context-wide value on purpose
+        const int64_t n_head_kv = hparams.n_head_kv(il);
+
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        if (n_head_kv == 0) {
+            // no KV heads on this layer -> mamba layer
+            cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
+        } else {
+            // Attention
+
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // No RoPE :)
+            cur = build_attn(inp_hybrid->get_attn(),
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+
+        // residual
+        ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
+        // label the residual sum itself; the previous code passed `cur` here, which renamed
+        // the mixer output and left ffn_inp without a name
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // feed-forward network
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+            // FFN
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+
+        // residual
+        cur = ggml_add(ctx0, ffn_inp, cur);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    // final rmsnorm
+    cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+#include "../llama-memory-hybrid.h"
+
+
+// Graph builder for the LFM2 architecture.
+//
+// Each layer applies an RMS norm followed by either a short-convolution block (when
+// hparams.is_recurrent(il)) or an attention block, a residual add, a second RMS norm and a
+// feed-forward block (MoE for layers at or past hparams.n_layer_dense_lead, dense otherwise),
+// then a final residual add.
+// The activations normed with model.tok_norm are stored in res->t_embd, the lm_head output in
+// res->t_logits.
+llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params),
+    model(model) {
+    ggml_tensor * cur = build_inp_embd(model.tok_embd);
+    cb(cur, "model.embed_tokens", -1);
+
+    ggml_tensor * inp_pos     = build_inp_pos();
+    auto *        inp_hybrid  = build_inp_mem_hybrid();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
+
+        auto * prev_cur = cur;
+        cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "model.layers.{}.operator_norm", il);
+
+        cur = hparams.is_recurrent(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
+                                         build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il);
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
+            prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
+        }
+
+        cur = ggml_add(ctx0, prev_cur, cur);
+
+        auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
+
+        ggml_tensor * ffn_out =
+            is_moe_layer ? build_moe_feed_forward(ffn_norm_out, il) : build_dense_feed_forward(ffn_norm_out, il);
+        // label the feed-forward result; the previous code passed ffn_norm_out here, which
+        // overwrote the norm's label and left ffn_out without one
+        cb(ffn_out, "model.layers.{}.ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_out);
+    }
+
+    cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "model.embedding_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "lm_head", -1);
+
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// Routed MoE feed-forward for layer `il`: SiLU experts selected via ffn_gate_inp, using the
+// layer's ffn_exp_probs_b and the gating function configured in hparams.expert_gating_func.
+ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const {
+    const auto & layer = model.layers[il];
+
+    const auto gating_func = static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func);
+
+    ggml_tensor * moe_out = build_moe_ffn(cur,
+            layer.ffn_gate_inp,
+            layer.ffn_up_exps,
+            layer.ffn_gate_exps,
+            layer.ffn_down_exps,
+            layer.ffn_exp_probs_b,
+            n_expert, n_expert_used,
+            LLM_FFN_SILU,
+            true, false, 0.0,
+            gating_func,
+            il);
+
+    return moe_out;
+}
+
+// Dense SiLU-gated parallel feed-forward for layer `il`.
+// The layer must not carry up/gate/down biases; this is asserted because none are passed on.
+ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const {
+    GGML_ASSERT(!model.layers[il].ffn_up_b);
+    GGML_ASSERT(!model.layers[il].ffn_gate_b);
+    GGML_ASSERT(!model.layers[il].ffn_down_b);
+
+    const auto & layer = model.layers[il];
+
+    ggml_tensor * ffn_out = build_ffn(cur,
+            layer.ffn_up,   nullptr, nullptr,
+            layer.ffn_gate, nullptr, nullptr,
+            layer.ffn_down, nullptr, nullptr,
+            nullptr,
+            LLM_FFN_SILU, LLM_FFN_PAR, il);
+
+    return ffn_out;
+}
+
+// Attention block for layer `il`: Q/K/V projections, per-head reshape, RMS norm on Q and K,
+// RoPE on Q and K (no frequency factors), then attention with scale 1/sqrt(n_embd_head_v)
+// and the layer's output projection (no output bias).
+ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor * cur,
+                                               ggml_tensor * inp_pos,
+                                               llm_graph_input_attn_kv * inp_attn,
+                                               int il) const {
+    GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
+
+    const auto & layer      = model.layers[il];
+    const auto   head_dim   = hparams.n_embd_head_v;
+    const auto   n_kv_heads = hparams.n_head_kv(il);
+
+    ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
+    cb(Qcur, "model.layers.{}.self_attn.q_proj", il);
+
+    ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+    cb(Kcur, "model.layers.{}.self_attn.k_proj", il);
+
+    ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+    cb(Vcur, "model.layers.{}.self_attn.v_proj", il);
+
+    // split the projections into heads
+    Qcur = ggml_reshape_3d(ctx0, Qcur, head_dim, n_head,     n_tokens);
+    Kcur = ggml_reshape_3d(ctx0, Kcur, head_dim, n_kv_heads, n_tokens);
+    Vcur = ggml_reshape_3d(ctx0, Vcur, head_dim, n_kv_heads, n_tokens);
+
+    // qk norm
+    Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
+    cb(Qcur, "model.layers.{}.self_attn.q_layernorm", il);
+
+    Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
+    cb(Kcur, "model.layers.{}.self_attn.k_layernorm", il);
+
+    // RoPE
+    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+
+    const float kq_scale = 1.0f / sqrtf(float(head_dim));
+
+    ggml_tensor * attn_out = build_attn(inp_attn,
+            layer.wo, nullptr,
+            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+
+    cb(attn_out, "model.layers.{}.self_attn.out_proj", il);
+
+    return attn_out;
+}
+
+// Short-convolution block for layer `il`.
+//
+// Projects the input with shortconv.in_proj, splits the projection into three equal chunks
+// (b, c, x), prepends the stored conv state to the transposed product b*x, runs ggml_ssm_conv
+// with the layer's conv kernel, gates the result with c and applies shortconv.out_proj.
+// As a side effect, a copy node that writes the updated conv state back into the recurrent
+// state tensor is added to the graph `gf`.
+// Returns the block output reshaped to 2D with n_seq_tokens * n_seqs columns.
+ggml_tensor * llm_build_lfm2::build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) {
+ // mctx is treated as a hybrid memory context; only its recurrent part is used here
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+ // NOTE(review): get_head() presumably is the first state slot used by this ubatch - confirm
+ const uint32_t kv_head = mctx_cur->get_head();
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ // the stored state holds n_shortconv_l_cache - 1 columns, so the cache length must exceed 1
+ GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
+ const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
+ cb(bcx, "model.layers.{}.conv.in_proj", il);
+
+ // split dim 0 of the projection into three equally sized views: b, c and x (in that order)
+ constexpr auto n_chunks = 3;
+ GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
+ const auto chunk_size = bcx->ne[0] / n_chunks;
+ auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 0 * chunk_size * ggml_element_size(bcx));
+ auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 1 * chunk_size * ggml_element_size(bcx));
+ auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 2 * chunk_size * ggml_element_size(bcx));
+
+ // element-wise product of b and x, transposed so that it can be concatenated with the
+ // conv state along dim 0 below
+ auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
+
+ // read conv state
+ auto * conv_state = mctx_cur->get_r_l(il);
+ auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
+ auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
+
+ // prepend the stored state to the new columns along dim 0
+ bx = ggml_concat(ctx0, conv, bx, 0);
+ GGML_ASSERT(bx->ne[0] > conv->ne[0]);
+
+ // last d_conv columns is a new conv state
+ auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
+ (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
+ GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
+
+ // write new conv state
+ // the destination is a 1D view into conv_state starting at byte offset
+ // kv_head * d_conv * n_embd * element size
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
+ ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
+ kv_head * d_conv * n_embd * ggml_element_size(new_conv))));
+
+ auto * conv_kernel = model.layers[il].shortconv.conv;
+ auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
+ cb(conv_out, "model.layers.{}.conv.conv", il);
+
+ // gate the convolution output with c, then project back
+ auto * y = ggml_mul(ctx0, c, conv_out);
+ y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
+ cb(y, "model.layers.{}.conv.out_proj", il);
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
+
+ return y;
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the LLaDA MoE architecture.
+//
+// Uses the attention input created by build_attn_inp_no_cache(). Per layer: RMS norm ->
+// Q/K/V projections -> RMS norm on Q and K -> RoPE on Q and K (no frequency factors) ->
+// attention -> residual add -> RMS norm -> routed MoE FFN -> residual add -> build_cvec.
+// The final RMS-normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE below
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_no_cache();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    ggml_tensor * cur = nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = layer_in;
+
+        // pre-attention norm
+        cur = build_norm(layer_in, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            // Q/K norms are applied before RoPE in this architecture
+            q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(q, "Qcur_normed", il);
+
+            k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            cb(k, "Kcur_normed", il);
+
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+            cur = build_attn(attn_inp,
+                    layer.wo, nullptr,
+                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_moe_ffn(cur,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, false,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(cur, "ffn_moe_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        layer_in = cur;
+    }
+
+    cur = build_norm(layer_in, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for the LLaDA architecture.
+//
+// LLaDA is similar to LLaMA but uses non-causal attention for diffusion; the attention input
+// is created with build_attn_inp_no_cache(). Per layer: RMS norm -> bias-free Q/K/V
+// projections -> RoPE on Q and K (no frequency factors) -> attention -> residual add ->
+// RMS norm -> SiLU-gated parallel FFN -> residual add -> build_cvec.
+// The final RMS-normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE below
+    ggml_tensor * positions = build_inp_pos();
+
+    // Non-causal attention for diffusion
+    auto * attn_inp = build_attn_inp_no_cache();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    ggml_tensor * cur = nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = layer_in;
+
+        // pre-attention norm
+        cur = build_norm(layer_in, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+            cur = build_attn(attn_inp,
+                    layer.wo, nullptr,
+                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_inp, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   nullptr, nullptr,
+                layer.ffn_gate, nullptr, nullptr,
+                layer.ffn_down, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        layer_in = cur;
+    }
+
+    cur = build_norm(layer_in, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the LLaMA variant that uses the attention input from
+// build_attn_inp_kv_iswa().
+//
+// Per layer: RMS norm -> Q/K/V projections (biases added when present) -> on layers where
+// RoPE is enabled, RoPE on Q and K, optionally followed by a weightless RMS norm on Q and K
+// (hparams.use_kq_norm); on the other layers Q is multiplied by the attention-scale input ->
+// attention -> residual add -> RMS norm -> either a dense FFN (when the layer has no
+// ffn_gate_inp) or routed MoE plus shared-expert FFN -> residual add -> build_cvec.
+// The final RMS-normed activations are stored in res->t_embd, the lm_head output in res->t_logits.
+llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // temperature tuning
+    ggml_tensor * inp_attn_scale = build_inp_attn_scale();
+
+    auto * inp_attn = build_attn_inp_kv_iswa();
+
+    // f_attention_scale == 0 means "not set": fall back to 1/sqrt(head size)
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // every n_no_rope_layer_step-th layer skips RoPE; a step of 0 disables RoPE on all layers
+        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                              (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            if (use_rope) {
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+            } else if (inp_attn_scale) {
+                // layers without RoPE scale Q by the attention-scale input instead
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+            }
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (use_rope && hparams.use_kq_norm) {
+                // Llama4TextL2Norm
+                Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                cb(Qcur, "Qcur_normed", il);
+                cb(Kcur, "Kcur_normed", il);
+            }
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            // label the normed tensor; the previous code passed `cur` here, which at this
+            // point is still the attention output, so the norm result was never named
+            cb(ffn_inp_normed, "ffn_norm", il);
+
+            ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                    il);
+
+            // Shared experts
+            ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+                    model.layers[il].ffn_up_shexp,   NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(shexp_out, "ffn_moe_shexp", il);
+
+            cur = ggml_add(ctx0, moe_out, shexp_out);
+            cb(cur, "ffn_moe_out_merged", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for the LLaMA architecture.
+//
+// The whole graph is assembled in this constructor:
+//   token embeddings
+//   -> n_layer x (RMS norm, RoPE self-attention, residual, RMS norm, dense or MoE FFN, residual)
+//   -> final RMS norm (stored in res->t_embd)
+//   -> output projection (stored in res->t_logits)
+llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    // K and V head sizes and the rotary dimension are all expected to be equal
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions of the tokens, consumed by RoPE below
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    // a non-zero f_attention_scale overrides the default 1/sqrt(head_dim)
+    const float kq_scale = hparams.f_attention_scale == 0.0f
+        ? 1.0f/sqrtf(float(head_dim))
+        : hparams.f_attention_scale;
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    ggml_tensor * cur = nullptr;
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around the attention block
+        ggml_tensor * attn_residual = hidden;
+
+        // norm
+        cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // projection of the normed input with an optional bias;
+            // the result is labelled once after the matmul and again after the bias add
+            auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * label) {
+                ggml_tensor * out = build_lora_mm(weight, cur);
+                cb(out, label, il);
+                if (bias) {
+                    out = ggml_add(ctx0, out, bias);
+                    cb(out, label, il);
+                }
+                return out;
+            };
+
+            // compute Q, K and V
+            ggml_tensor * q = project(layer.wq, layer.bq, "Qcur");
+            ggml_tensor * k = project(layer.wk, layer.bk, "Kcur");
+            ggml_tensor * v = project(layer.wv, layer.bv, "Vcur");
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+            // RoPE is applied to Q and K only
+            q = ggml_rope_ext(ctx0, q, positions, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, positions, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            if (hparams.use_kq_norm) {
+                // Llama4TextL2Norm
+                q = ggml_rms_norm(ctx0, q, hparams.f_norm_rms_eps);
+                k = ggml_rms_norm(ctx0, k, hparams.f_norm_rms_eps);
+                cb(q, "Qcur_normed", il);
+                cb(k, "Kcur_normed", il);
+            }
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        const bool is_last_layer = il == n_layer - 1;
+        if (is_last_layer && out_ids) {
+            cur           = ggml_get_rows(ctx0, cur,           out_ids);
+            attn_residual = ggml_get_rows(ctx0, attn_residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, attn_residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // both FFN variants consume the same RMS-normed input
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        if (layer.ffn_gate_inp == nullptr) {
+            // feed-forward network (non-MoE)
+            cur = build_ffn(cur,
+                    layer.ffn_up,   layer.ffn_up_b,   NULL,
+                    layer.ffn_gate, layer.ffn_gate_b, NULL,
+                    layer.ffn_down, layer.ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    layer.ffn_gate_inp,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+
+        // residual connection around the FFN
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        hidden = cur;
+    }
+
+    cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+
+// Graph builder shared by Mamba and Mamba2 models.
+//
+// Every layer is: RMS norm -> Mamba (or Mamba2) layer -> residual add -> control vector.
+// The final RMS-normed hidden state is stored in res->t_embd and the output
+// projection in res->t_logits.
+llm_build_mamba::llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
+    // {n_embd, n_tokens}
+    ggml_tensor * residual = build_inp_embd(model.tok_embd);
+
+    auto * rs_inp = build_rs_inp();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    // the architecture decides which layer builder is used for all layers
+    const bool is_mamba2 = model.arch == LLM_ARCH_MAMBA2;
+
+    for (int il = 0; il < n_layer; ++il) {
+        // norm
+        ggml_tensor * cur = build_norm(residual, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        cur = is_mamba2
+            ? build_mamba2_layer(rs_inp, cur, model, ubatch, il)
+            : build_mamba_layer (rs_inp, cur, model, ubatch, il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        const bool is_last_layer = il == n_layer - 1;
+        if (is_last_layer && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur,      out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        // residual
+        cur = ggml_add(ctx0, cur, residual);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        residual = cur;
+    }
+
+    // final rmsnorm
+    ggml_tensor * out = build_norm(residual, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
+
--- /dev/null
+#include "models.h"
+
+
+// Graph builder for MiniCPM3.
+//
+// The attention block uses low-rank projections: Q goes through wq_a -> norm -> wq_b,
+// K/V go through wkv_a_mqa -> norm -> wkv_b. Each head is split into a part that
+// gets RoPE ("pe") and a part that does not ("nope"); a single RoPE'd key slice is
+// repeated across heads. Embeddings, both residual branches and the lm_head input
+// are scaled by the constants below.
+llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    //TODO: if the model varies, these parameters need to be read from the model
+    const int64_t n_embd_base = 256;
+    const float scale_embd  = 12.0f;
+    const float scale_depth = 1.4f;
+    const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+    // scale_res - scale the hidden states for residual connection
+    // (same value for every layer, so it is computed once up front)
+    const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
+
+    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+    const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // scale the input embeddings
+    inpL = ggml_scale(ctx0, inpL, scale_embd);
+    cb(inpL, "inp_scaled", -1);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            ggml_tensor * q = NULL;
+            // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+            q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+            cb(q, "q", il);
+
+            q = build_norm(q,
+                    model.layers[il].attn_q_a_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(q, "q", il);
+
+            // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+            q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+            cb(q, "q", il);
+
+            // split into {n_head * n_embd_head_qk_nope, n_tokens}
+            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                    ggml_row_size(q->type, hparams.n_embd_head_k),
+                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                    0);
+            cb(q_nope, "q_nope", il);
+
+            // and {n_head * n_embd_head_qk_rope, n_tokens}
+            // (same strides, offset past the nope part of each head)
+            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                    ggml_row_size(q->type, hparams.n_embd_head_k),
+                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                    ggml_row_size(q->type, n_embd_head_qk_nope));
+            cb(q_pe, "q_pe", il);
+
+            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+            ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+            // the tensor label keeps its historical spelling: it is a runtime name, not a comment
+            cb(kv_pe_compressed, "kv_pe_compresseed", il);
+
+            // split into {kv_lora_rank, n_tokens}
+            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
+                    kv_pe_compressed->nb[1],
+                    0);
+            cb(kv_compressed, "kv_compressed", il);
+
+            // and {n_embd_head_qk_rope, n_tokens}
+            // (viewed as a single head so it can go through RoPE like q_pe)
+            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                    kv_pe_compressed->nb[1],
+                    kv_pe_compressed->nb[1],
+                    ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
+            cb(k_pe, "k_pe", il);
+
+            kv_compressed = build_norm(kv_compressed,
+                    model.layers[il].attn_kv_a_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(kv_compressed, "kv_compressed", il);
+
+            // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+            ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+            cb(kv, "kv", il);
+
+            // split into {n_head * n_embd_head_qk_nope, n_tokens}
+            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                    0);
+            cb(k_nope, "k_nope", il);
+
+            // and {n_head * n_embd_head_v, n_tokens}
+            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+            cb(v_states, "v_states", il);
+
+            // the V view is strided, so materialize it as a contiguous tensor
+            v_states = ggml_cont(ctx0, v_states);
+            cb(v_states, "v_states", il);
+
+            q_pe = ggml_rope_ext(
+                    ctx0, q_pe, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+            cb(q_pe, "q_pe", il);
+
+            // shared RoPE key
+            k_pe = ggml_rope_ext(
+                    ctx0, k_pe, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+            cb(k_pe, "k_pe", il);
+
+            // per head: [nope | pe] along dimension 0
+            ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+            cb(q_states, "q_states", il);
+
+            // the single-head k_pe is repeated to the shape of q_pe before concatenation
+            ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+            cb(k_states, "k_states", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+        // on the last layer, keep only the rows selected by inp_out_ids (when that input exists)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        // scale the attention output before the residual connection
+        cur = ggml_scale(ctx0, cur, scale_res);
+        cb(cur, "hidden_scaled", il);
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+        // scale the hidden states for residual connection
+        cur = ggml_scale(ctx0, cur, scale_res);
+        cb(cur, "hidden_scaled_ffn", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head scaling
+    const float scale_lmhead = float(n_embd_base)/float(n_embd);
+    cur = ggml_scale(ctx0, cur, scale_lmhead);
+    cb(cur, "lmhead_scaling", -1);
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+
+#include "models.h"
+
+// Graph builder for MiniMax-M2.
+//
+// Every layer is: RMS norm -> self-attention (Q and K are RMS-normed with learned
+// weights before RoPE) -> residual -> RMS norm -> MoE FFN -> residual -> control vector.
+// The final RMS-normed hidden state is stored in res->t_embd and the output
+// projection in res->t_logits.
+llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
+
+    // attention scale is the same for every layer, so compute it once
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos     = build_inp_pos();
+    auto *        inp_attn    = build_attn_inp_kv();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // kept for the residual connection around the attention block
+        ggml_tensor * inpSA = inpL;
+
+        // self_attention
+        {
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            // Q/K norms are applied on the flat projections, before the per-head reshape
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when that input exists)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // the gating function comes from the hparams; ffn_exp_probs_b is passed to the MoE FFN as well
+        cur = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps,
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, true,
+                false, 0.0,
+                (llama_expert_gating_func_type) hparams.expert_gating_func,
+                il);
+        cb(cur, "ffn_moe_out", il);
+
+        // residual connection around the MoE FFN
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#pragma once
+
+#include "../llama-model.h"
+#include "../llama-graph.h"
+#include "../llama-memory-recurrent.h"
+
+#include <cmath>
+
+// Graph context extended with Mamba layer builders.
+// Used as the base of builders whose models contain Mamba layers
+// (in this header: falcon_h1, granite_hybrid, jamba, mamba, nemotron_h, plamo2).
+struct llm_graph_context_mamba : public llm_graph_context {
+ llm_graph_context_mamba(const llm_graph_params & params);
+
+ virtual ~llm_graph_context_mamba() = default;
+
+ // Build one Mamba / Mamba2 layer on top of `cur` for layer index `il` and return the result.
+ // NOTE(review): build_mamba_layer is non-const while build_mamba2_layer is const - confirm this is intended.
+ ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
+ ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const;
+
+};
+
+// Base class for RWKV6-related models
+// (in this header: llm_build_rwkv6 and llm_build_rwkv6qwen2 derive from it)
+struct llm_build_rwkv6_base : public llm_graph_context {
+ // reference to the model passed to the constructor, kept for the helper methods below
+ const llama_model & model;
+
+ llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params);
+
+ virtual ~llm_build_rwkv6_base() = default;
+
+ // RWKV6 channel-mix sub-block for one layer; returns the resulting tensor
+ ggml_tensor * build_rwkv6_channel_mix(const llama_layer * layer,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ llm_arch arch) const;
+
+ // RWKV6 time-mix sub-block for layer `il`; returns the resulting tensor
+ ggml_tensor * build_rwkv6_time_mix(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ const llama_ubatch & ubatch,
+ int il) const;
+};
+
+// Base class for RWKV7-related models
+// (in this header: llm_build_rwkv7 and llm_build_arwkv7 derive from it)
+struct llm_build_rwkv7_base : public llm_graph_context {
+ // reference to the model passed to the constructor, kept for the helper methods below
+ const llama_model & model;
+
+ llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params);
+
+ virtual ~llm_build_rwkv7_base() = default;
+
+ // RWKV7-specific graph building methods
+ ggml_tensor * build_rwkv7_channel_mix(const llama_layer * layer,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ llm_arch arch) const;
+ // `first_layer_value` is a reference to a tensor pointer, so the callee can
+ // replace the caller's pointer - presumably to share a value across layers; confirm in the .cpp
+ ggml_tensor * build_rwkv7_time_mix(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ ggml_tensor *& first_layer_value,
+ const llama_ubatch & ubatch,
+ int il) const;
+};
+
+// Per-architecture graph builders.
+// Each struct below only declares a constructor taking the model and the graph
+// params; the constructors are defined in the matching models/*.cpp files
+// (presumably each one assembles the whole forward graph, as llm_build_llama does).
+struct llm_build_apertus : public llm_graph_context {
+ llm_build_apertus(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_arcee : public llm_graph_context {
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_arctic : public llm_graph_context {
+ llm_build_arctic(const llama_model & model, const llm_graph_params & params);
+};
+
+// built on the RWKV7 base class rather than directly on llm_graph_context
+struct llm_build_arwkv7 : public llm_build_rwkv7_base {
+ llm_build_arwkv7(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_baichuan : public llm_graph_context {
+ llm_build_baichuan(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bailingmoe2 : public llm_graph_context {
+ llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bailingmoe : public llm_graph_context {
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bert : public llm_graph_context {
+ llm_build_bert(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bitnet : public llm_graph_context {
+ llm_build_bitnet(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bloom : public llm_graph_context {
+ llm_build_bloom(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_chameleon : public llm_graph_context {
+ llm_build_chameleon(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_chatglm : public llm_graph_context {
+ llm_build_chatglm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_codeshell : public llm_graph_context {
+ llm_build_codeshell(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_cogvlm : public llm_graph_context {
+ llm_build_cogvlm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_cohere2_iswa : public llm_graph_context {
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_command_r : public llm_graph_context {
+ llm_build_command_r(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_dbrx : public llm_graph_context {
+ llm_build_dbrx(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_deci : public llm_graph_context {
+ llm_build_deci(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_deepseek2 : public llm_graph_context {
+ llm_build_deepseek2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_deepseek : public llm_graph_context {
+ llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_dots1 : public llm_graph_context {
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_dream : public llm_graph_context {
+ llm_build_dream(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_ernie4_5 : public llm_graph_context {
+ llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_ernie4_5_moe : public llm_graph_context {
+ llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+// class template with a compile-time `iswa` flag
+// (presumably selects the sliding-window-attention variant - confirm in models/exaone4.cpp)
+template <bool iswa>
+struct llm_build_exaone4 : public llm_graph_context {
+ llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_exaone : public llm_graph_context {
+ llm_build_exaone(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_falcon : public llm_graph_context {
+ llm_build_falcon(const llama_model & model, const llm_graph_params & params);
+};
+
+// built on the Mamba graph context, which provides the Mamba layer builders
+struct llm_build_falcon_h1 : public llm_graph_context_mamba {
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gemma2_iswa : public llm_graph_context {
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gemma3_iswa : public llm_graph_context {
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+// Graph builder for gemma3n (iswa variant).
+// Unlike the simple builders it keeps a model reference plus a few constants
+// and declares helper methods for the individual sub-steps of the graph.
+struct llm_build_gemma3n_iswa : public llm_graph_context {
+ const llama_model & model;
+
+ // const members without a default here are expected to be set by the constructor
+ const int64_t n_embd_head;
+ const int64_t n_embd_altup;
+ const int64_t n_altup;
+ const int i_altup_act;
+ const int n_layer_sparsity = 10; // number of layers using activation sparsity
+ const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
+
+ llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params);
+ // helper steps; each returns a tensor - see models/gemma3n-iswa.cpp for the exact semantics
+ ggml_tensor * calc_magnitude(ggml_tensor * x);
+ ggml_tensor * view_2d_slice(ggml_tensor * x, int idx);
+ ggml_tensor * get_per_layer_inputs();
+ ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer);
+ ggml_tensor * gaussian_topk(ggml_tensor * x);
+ ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il);
+ ggml_tensor * altup_predict(ggml_tensor * cur, int il);
+ ggml_tensor * laurel(ggml_tensor * cur, int il);
+ ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
+};
+
+// Constructor-only graph builders (gemma_embedding, gemma, glm4, glm4_moe, gpt2, gptneox);
+// the constructors are defined in the matching models/*.cpp files.
+struct llm_build_gemma_embedding : public llm_graph_context {
+ llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gemma : public llm_graph_context {
+ llm_build_gemma(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_glm4 : public llm_graph_context {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_glm4_moe : public llm_graph_context {
+ llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gpt2 : public llm_graph_context {
+ llm_build_gpt2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gptneox : public llm_graph_context {
+ llm_build_gptneox(const llama_model & model, const llm_graph_params & params);
+};
+
+// Graph builder for granite; the per-layer attention and FFN parts are split into private helpers.
+struct llm_build_granite : public llm_graph_context {
+ llm_build_granite(const llama_model & model, const llm_graph_params & params);
+
+private:
+ // attention part of layer `il`; returns the resulting tensor
+ ggml_tensor * build_attention_layer(
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il);
+
+ // FFN part of layer `il`; `inpSA` is presumably the residual input saved before attention - confirm in models/granite.cpp
+ ggml_tensor * build_layer_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il);
+};
+
+// Graph builder for granite-hybrid, based on the Mamba graph context.
+// Declares helpers with the same names and parameter lists as llm_build_granite, but as public members.
+struct llm_build_granite_hybrid : public llm_graph_context_mamba {
+ llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params);
+ ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
+ ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,const int64_t n_embd_head, const int il);
+};
+
+// Constructor-only graph builders (grok ... jamba);
+// the constructors are defined in the matching models/*.cpp files.
+struct llm_build_grok : public llm_graph_context {
+ llm_build_grok(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_grovemoe : public llm_graph_context {
+ llm_build_grovemoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_hunyuan_dense : public llm_graph_context {
+ llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_hunyuan_moe : public llm_graph_context {
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_internlm2 : public llm_graph_context {
+ llm_build_internlm2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_jais : public llm_graph_context {
+ llm_build_jais(const llama_model & model, const llm_graph_params & params);
+};
+
+// built on the Mamba graph context, which provides the Mamba layer builders
+struct llm_build_jamba : public llm_graph_context_mamba {
+ llm_build_jamba(const llama_model & model, const llm_graph_params & params);
+};
+
+// Graph builder for lfm2; keeps a model reference and declares one helper per block type.
+struct llm_build_lfm2 : public llm_graph_context {
+ const llama_model & model;
+
+ llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
+ // feed-forward helpers for layer `il` (MoE and dense variants)
+ ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const;
+ ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const;
+ // attention block for layer `il`
+ ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const;
+ // short-convolution block for layer `il`; the only non-const helper of this struct
+ ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il);
+
+};
+
+// Constructor-only graph builders (llada ... nemotron);
+// the constructors are defined in the matching models/*.cpp files.
+struct llm_build_llada : public llm_graph_context {
+ llm_build_llada(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_llada_moe : public llm_graph_context {
+ llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_llama : public llm_graph_context {
+ llm_build_llama(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+// handles both LLM_ARCH_MAMBA2 and the non-Mamba2 case via the base-class layer builders
+struct llm_build_mamba : public llm_graph_context_mamba {
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_minicpm3 : public llm_graph_context {
+ llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_minimax_m2 : public llm_graph_context {
+ llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_mpt : public llm_graph_context {
+ llm_build_mpt(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_nemotron : public llm_graph_context {
+ llm_build_nemotron(const llama_model & model, const llm_graph_params & params);
+};
+
+// Graph builder for nemotron-h, based on the Mamba graph context,
+// with separate helpers for the FFN and attention parts of layer `il`.
+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+ llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params);
+ ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il);
+ // note: takes no position input, unlike the granite attention helpers declared in this header
+ ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model, const int64_t n_embd_head, const int il);
+};
+
+// Constructor-only graph builders (neo_bert ... phi3);
+// the constructors are defined in the matching models/*.cpp files.
+struct llm_build_neo_bert : public llm_graph_context {
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
+};
+
+// class template with a compile-time `iswa` flag
+// (presumably selects the sliding-window-attention variant - confirm in models/olmo2.cpp)
+template <bool iswa>
+struct llm_build_olmo2 : public llm_graph_context {
+ llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_olmoe : public llm_graph_context {
+ llm_build_olmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_olmo : public llm_graph_context {
+ llm_build_olmo(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_openai_moe_iswa : public llm_graph_context {
+ llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_openelm : public llm_graph_context {
+ llm_build_openelm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_orion : public llm_graph_context {
+ llm_build_orion(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_phi2 : public llm_graph_context {
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params);
+};
+
+// class template with a compile-time `iswa` flag, like llm_build_olmo2
+template<bool iswa>
+struct llm_build_phi3 : public llm_graph_context {
+ llm_build_phi3(const llama_model & model, const llm_graph_params & params);
+};
+
+// Graph builder for plamo2, based on the Mamba graph context.
+// Declares its own private Mamba-layer and attention-layer helpers.
+struct llm_build_plamo2 : public llm_graph_context_mamba {
+ llm_build_plamo2(const llama_model & model, const llm_graph_params & params);
+ private:
+ // plamo2-specific Mamba layer for layer `il`; same parameter list as the base-class build_mamba_layer
+ ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
+ // plamo2-specific attention layer for layer `il`
+ ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur,
+ const llama_model & model, int il);
+};
+
+struct llm_build_plamo : public llm_graph_context {
+ llm_build_plamo(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_plm : public llm_graph_context {
+ llm_build_plm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen2 : public llm_graph_context {
+ llm_build_qwen2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen2moe : public llm_graph_context {
+ llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen2vl : public llm_graph_context {
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3 : public llm_graph_context {
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3moe : public llm_graph_context {
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3vl : public llm_graph_context {
+ llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3vlmoe : public llm_graph_context {
+ llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+
+struct llm_build_qwen : public llm_graph_context {
+ llm_build_qwen(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_refact : public llm_graph_context {
+ llm_build_refact(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_rwkv6 : public llm_build_rwkv6_base {
+ llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { // graph builder type for "rwkv6qwen2"; extends llm_build_rwkv6_base (declared outside this chunk)
+ llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/rwkv6qwen2.cpp)
+};
+
+struct llm_build_rwkv7 : public llm_build_rwkv7_base { // graph builder type for "rwkv7"; extends llm_build_rwkv7_base (declared outside this chunk)
+ llm_build_rwkv7(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/rwkv7.cpp)
+};
+
+struct llm_build_seed_oss : public llm_graph_context { // graph builder type for "seed_oss"; extends llm_graph_context
+ llm_build_seed_oss(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/seed-oss.cpp)
+};
+
+template <bool iswa> // compile-time flag; its effect is not visible here - NOTE(review): by analogy with llm_build_olmo2 it presumably selects the sliding-window KV attention input, confirm in models/smallthinker.cpp
+struct llm_build_smallthinker : public llm_graph_context { // graph builder type for "smallthinker"; extends llm_graph_context
+ llm_build_smallthinker(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header
+};
+
+struct llm_build_smollm3 : public llm_graph_context { // graph builder type for "smollm3"; extends llm_graph_context
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/smollm3.cpp)
+};
+
+struct llm_build_stablelm : public llm_graph_context { // graph builder type for "stablelm"; extends llm_graph_context
+ llm_build_stablelm(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/stablelm.cpp)
+};
+
+struct llm_build_starcoder2 : public llm_graph_context { // graph builder type for "starcoder2"; extends llm_graph_context
+ llm_build_starcoder2(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/starcoder2.cpp)
+};
+
+struct llm_build_starcoder : public llm_graph_context { // graph builder type for "starcoder"; extends llm_graph_context
+ llm_build_starcoder(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/starcoder.cpp)
+};
+
+struct llm_build_t5_dec : public llm_graph_context { // graph builder type for "t5_dec"; extends llm_graph_context
+ llm_build_t5_dec(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/t5-dec.cpp)
+};
+
+struct llm_build_t5_enc : public llm_graph_context { // graph builder type for "t5_enc"; extends llm_graph_context
+ llm_build_t5_enc(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/t5-enc.cpp)
+};
+
+struct llm_build_wavtokenizer_dec : public llm_graph_context { // graph builder type for "wavtokenizer_dec"; extends llm_graph_context
+ llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/wavtokenizer-dec.cpp)
+};
+
+struct llm_build_xverse : public llm_graph_context { // graph builder type for "xverse"; extends llm_graph_context
+ llm_build_xverse(const llama_model & model, const llm_graph_params & params); // declared only; body is outside this header (presumably models/xverse.cpp)
+};
--- /dev/null
+#include "models.h"
+
+
+
+// Assembles the MPT forward graph. Token embeddings (plus learned position embeddings when
+// model.pos_embd is present) pass through n_layer blocks of fused-QKV attention followed by a
+// GELU feed-forward, each closed by a residual add. The final norm is published as
+// res->t_embd and the output projection as res->t_logits.
+llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    if (model.pos_embd) {
+        // fetch one position-embedding row per token and add it to the token embedding
+        ggml_tensor * positions = build_inp_pos();
+        ggml_tensor * pos_rows  = ggml_get_rows(ctx0, model.pos_embd, positions);
+        cb(pos_rows, "pos_embd", -1);
+
+        hidden = ggml_add(ctx0, hidden, pos_rows);
+        cb(hidden, "inpL", -1);
+    }
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    // byte stride/offsets used to slice Q, K and V out of the fused projection
+    // (computed with sizeof(float), exactly as the views below require)
+    const size_t head_stride = n_embd_head * sizeof(float);
+    const size_t q_offset    = 0 * sizeof(float) * (n_embd);
+    const size_t k_offset    = 1 * sizeof(float) * (n_embd);
+    const size_t v_offset    = 1 * sizeof(float) * (n_embd + n_embd_gqa);
+
+    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * normed = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(normed, "attn_norm", il);
+
+        // ---- self-attention ----
+        ggml_tensor * cur = build_lora_mm(layer.wqkv, normed);
+        cb(cur, "wqkv", il);
+
+        if (layer.bqkv) {
+            cur = ggml_add(ctx0, cur, layer.bqkv);
+            cb(cur, "bqkv", il);
+        }
+
+        // optional symmetric clamp of the fused projection
+        if (hparams.f_clamp_kqv > 0.0f) {
+            cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+            cb(cur, "wqkv_clamped", il);
+        }
+
+        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, head_stride, cur->nb[1], q_offset);
+        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_stride, cur->nb[1], k_offset);
+        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_stride, cur->nb[1], v_offset);
+
+        // optional Q/K layer norm: flatten the heads, normalise, then restore the 3-D shape
+        if (layer.attn_q_norm) {
+            Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
+            Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head * n_head_kv, n_tokens);
+
+            Qcur = build_norm(Qcur, layer.attn_q_norm, layer.attn_q_norm_b, LLM_NORM, il);
+            Kcur = build_norm(Kcur, layer.attn_k_norm, layer.attn_k_norm_b, LLM_NORM, il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        }
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        cur = build_attn(attn_inp,
+                layer.wo, layer.bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur    = ggml_get_rows(ctx0, cur, out_ids);
+            hidden = ggml_get_rows(ctx0, hidden, out_ids);
+        }
+
+        // residual around attention
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, hidden);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // ---- feed-forward ----
+        cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   layer.ffn_up_b,   NULL,
+                NULL,           NULL,             NULL,
+                layer.ffn_down, layer.ffn_down_b, NULL,
+                layer.ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, il);
+        cb(cur, "ffn_out", il);
+
+        // residual around the feed-forward
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+
+// Assembles the Nemotron-H graph. Each layer is RMS-normed and then routed to exactly one
+// sub-block - mamba2 (recurrent layers), attention (non-recurrent layers whose n_ff is 0) or
+// feed-forward (everything else) - followed by a residual add with the layer input.
+llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context_mamba(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+    ggml_build_forward_expand(gf, hidden);
+
+    auto * mem_inp = build_inp_mem_hybrid();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * residual = hidden;
+
+        // pre-norm shared by all three layer kinds
+        ggml_tensor * cur = build_norm(hidden, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        if (hparams.is_recurrent(il)) {
+            // state-space (mamba2) layer
+            cur = build_mamba2_layer(mem_inp->get_recr(), cur, model, ubatch, il);
+        } else if (hparams.n_ff(il) == 0) {
+            // attention layer
+            cur = build_attention_layer(cur, mem_inp->get_attn(), model, n_embd_head, il);
+        } else {
+            // feed-forward layer
+            cur = build_ffn_layer(cur, model, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        // residual connection
+        cur = ggml_add(ctx0, cur, residual);
+        cb(cur, "nemotron_h_block_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
+
+// Builds one attention sub-block for layer `il` from the already-normed activations `cur`.
+// Q, K and V are separate projections with optional biases; no RoPE is applied in this function.
+// The softmax scale is hparams.f_attention_scale, or 1/sqrt(n_embd_head) when that is 0.
+// Returns the attention output (after the wo/bo projection handled by build_attn).
+ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *             cur,
+                                                          llm_graph_input_attn_kv * inp_attn,
+                                                          const llama_model &       model,
+                                                          const int64_t             n_embd_head,
+                                                          const int                 il) {
+    const auto & layer = model.layers[il];
+
+    // projection + optional bias; the tensor is labelled after each step
+    const auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * label) {
+        ggml_tensor * t = build_lora_mm(weight, cur);
+        cb(t, label, il);
+        if (bias) {
+            t = ggml_add(ctx0, t, bias);
+            cb(t, label, il);
+        }
+        return t;
+    };
+
+    ggml_tensor * Qcur = project(layer.wq, layer.bq, "Qcur");
+    ggml_tensor * Kcur = project(layer.wk, layer.bk, "Kcur");
+    ggml_tensor * Vcur = project(layer.wv, layer.bv, "Vcur");
+
+    // split the projections into heads; head counts are looked up per layer
+    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+    cb(Qcur, "Qcur", il);
+    cb(Kcur, "Kcur", il);
+    cb(Vcur, "Vcur", il);
+
+    float kq_scale = hparams.f_attention_scale;
+    if (hparams.f_attention_scale == 0.0f) {
+        kq_scale = 1.0f / sqrtf(float(n_embd_head));
+    }
+
+    cur = build_attn(inp_attn,
+            layer.wo, layer.bo,
+            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+    cb(cur, "attn_out", il);
+    return cur;
+}
+
+// Builds one feed-forward sub-block for layer `il`: up projection, squared-ReLU activation,
+// down projection (no gate), then build_cvec. Returns the resulting tensor.
+ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
+    const auto & layer = model.layers[il];
+
+    ggml_tensor * ffn_out = build_ffn(cur,
+            layer.ffn_up,   layer.ffn_up_b,   NULL,
+            NULL,           NULL,             NULL,
+            layer.ffn_down, layer.ffn_down_b, NULL,
+            NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+    cb(ffn_out, "ffn_out", il);
+
+    ggml_tensor * layer_out = build_cvec(ffn_out, il);
+    cb(layer_out, "l_out", il);
+
+    return layer_out;
+}
--- /dev/null
+#include "models.h"
+
+// Assembles the Nemotron graph: pre-LayerNorm attention with RoPE on Q/K and optional
+// Q/K/V biases, then a squared-ReLU feed-forward without gate; both sub-blocks are wrapped
+// in residual adds. Publishes the final norm as res->t_embd and the lm_head as res->t_logits.
+llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    //GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    // rotary embedding with the context-wide RoPE parameters
+    const auto apply_rope = [&](ggml_tensor * t) {
+        return ggml_rope_ext(
+                ctx0, t, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+    };
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // ---- self-attention ----
+        {
+            // projection + optional bias; the tensor is labelled after each step
+            const auto project = [&](ggml_tensor * weight, ggml_tensor * bias, const char * label) {
+                ggml_tensor * t = build_lora_mm(weight, cur);
+                cb(t, label, il);
+                if (bias) {
+                    t = ggml_add(ctx0, t, bias);
+                    cb(t, label, il);
+                }
+                return t;
+            };
+
+            ggml_tensor * Qcur = project(layer.wq, layer.bq, "Qcur");
+            ggml_tensor * Kcur = project(layer.wk, layer.bk, "Kcur");
+            ggml_tensor * Vcur = project(layer.wv, layer.bv, "Vcur");
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = apply_rope(Qcur);
+            Kcur = apply_rope(Kcur);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // ---- feed-forward ----
+        cur = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   layer.ffn_up_b,   NULL,
+                NULL,           NULL,             NULL,
+                layer.ffn_down, layer.ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+
+        // note: the "ffn_out" label is attached after the residual add here
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+// Assembles the NeoBERT encoder graph. Attention runs without a KV cache
+// (build_attn_inp_no_cache), uses a fused QKV projection with RoPE on Q/K, and is followed by
+// a SwiGLU feed-forward; both sub-blocks use RMS pre-norm and a residual add. Only
+// res->t_embd is produced - this builder sets no logits.
+llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // `cur` is the single working tensor for the whole function; the layer loop reuses it
+    // rather than declaring a second, shadowing `cur`
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // input embeddings: only token embeddings are built here
+    inpL = build_inp_embd(model.tok_embd);
+    cb(inpL, "inp_embd", -1);
+
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // pre-norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+
+        {
+            ggml_tensor * Qcur;
+            ggml_tensor * Kcur;
+            ggml_tensor * Vcur;
+
+            // self-attention: one fused projection, sliced into Q, K and V views
+            cur = build_lora_mm(model.layers[il].wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+            Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+            Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+            // RoPE
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, nullptr,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            cb(cur, "kqv_out", il);
+        }
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+        // re-add the layer input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        ggml_tensor * ffn_inp = cur;
+        cb(ffn_inp, "ffn_inp", il);
+
+        // pre-norm
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // feed-forward network
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,
+                NULL, NULL, NULL, NULL, NULL,
+                model.layers[il].ffn_down,
+                NULL, NULL, NULL,
+                LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+
+        // residual around the feed-forward
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm_enc, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_embd", -1);
+    res->t_embd = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Assembles the OLMo graph. All norms are LayerNorm without weight or bias (NULL, NULL),
+// Q/K/V projections are optionally clamped to +/- hparams.f_clamp_kqv, RoPE is applied to
+// Q and K, and the feed-forward is a gated SiLU MLP. Each sub-block ends in a residual add.
+llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    // rotary embedding with the context-wide RoPE parameters
+    const auto apply_rope = [&](ggml_tensor * t) {
+        return ggml_rope_ext(
+                ctx0, t, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+    };
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, NULL, NULL, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // ---- self-attention ----
+        {
+            // projection followed by the optional clamp; labelled after each step
+            const auto project = [&](ggml_tensor * weight, const char * label) {
+                ggml_tensor * t = build_lora_mm(weight, cur);
+                cb(t, label, il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    t = ggml_clamp(ctx0, t, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(t, label, il);
+                }
+                return t;
+            };
+
+            ggml_tensor * Qcur = project(layer.wq, "Qcur");
+            ggml_tensor * Kcur = project(layer.wk, "Kcur");
+            ggml_tensor * Vcur = project(layer.wv, "Vcur");
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = apply_rope(Qcur);
+            Kcur = apply_rope(Kcur);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, nullptr,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // ---- feed-forward ----
+        cur = build_norm(ffn_inp, NULL, NULL, LLM_NORM, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // the same label is attached again after the residual add
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, NULL, NULL, LLM_NORM, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+// Assembles the OLMo2 graph. `iswa` selects at compile time between the sliding-window KV
+// attention input (build_attn_inp_kv_iswa) and the regular one (build_attn_inp_kv).
+// Normalisation is applied to the *outputs* of attention and of the feed-forward
+// (attn_post_norm / ffn_post_norm) before each residual add, and Q/K are RMS-normed
+// before RoPE.
+template <bool iswa>
+llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // no pre-norm: attention consumes the layer input directly
+        cur = inpL;
+
+        // self_attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            // Q/K norm is applied before the tensors are split into heads
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            const bool is_swa = hparams.is_swa(il);
+
+            if (is_swa) {
+                // For sliding-window layers, Olmo3 uses regular RoPE with no YaRN rope scaling.
+                // This is achieved here by setting freq_scale and attn_factor to 1.
+                // We also set ext_factor to 0 to avoid a few unnecessary computations.
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                    0.0, 1.0, beta_fast, beta_slow
+                    );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                    0.0, 1.0, beta_fast, beta_slow
+                    );
+            } else {
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+            }
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        cur = build_norm(cur,
+                model.layers[il].attn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_ffn(ffn_inp,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // this is a per-layer norm, so it carries the layer index like attn_post_norm above
+        // (it was previously built and labelled with -1, the value used for non-layer tensors)
+        cur = build_norm(cur,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_olmo2<false>;
+template struct llm_build_olmo2<true>;
--- /dev/null
+#include "models.h"
+
+// Assembles the OLMoE graph: RMS pre-norm attention with RMS-normed Q/K and RoPE, followed
+// by a softmax-gated mixture-of-experts feed-forward with SiLU activation. Each sub-block
+// ends in a residual add.
+llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    // rotary embedding with the context-wide RoPE parameters
+    const auto apply_rope = [&](ggml_tensor * t) {
+        return ggml_rope_ext(
+                ctx0, t, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+    };
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * residual = hidden;
+
+        ggml_tensor * cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // ---- self-attention ----
+        {
+            ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            // Q/K norm is applied before the tensors are split into heads
+            Qcur = build_norm(Qcur, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Kcur = build_norm(Kcur, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = apply_rope(Qcur);
+            Kcur = apply_rope(Kcur);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(attn_inp,
+                    layer.wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by out_ids
+        if (il == n_layer - 1 && out_ids) {
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // ---- mixture-of-experts feed-forward ----
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_moe_ffn(cur,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, false,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(cur, "ffn_moe_out", il);
+
+        // residual around the MoE block
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        hidden = cur;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+// Assembles the OpenAI-MoE graph with sliding-window KV attention input
+// (build_attn_inp_kv_iswa). Attention uses biased Q/K/V projections, RoPE on Q/K and per-layer
+// attention sinks; the feed-forward is a mixture of experts with biased expert weights.
+// Heads are shaped with n_rot as the per-head width, and the softmax scale is 1/sqrt(n_rot).
+llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv_iswa();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, nullptr,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // attn_sinks is passed through to build_attn alongside the usual wo/bo projection
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1) {
+            // skip computing output for unused tokens
+            ggml_tensor * inp_out_ids = build_inp_out_ids();
+            // guard against a null id tensor, matching the `&& inp_out_ids` check the
+            // other builders in this directory perform before pruning rows
+            if (inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // the norm that feeds the MoE block is stored under attn_post_norm
+        cur = ffn_inp;
+        cur = build_norm(cur,
+                model.layers[il].attn_post_norm, nullptr,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        // MoE branch
+        cur = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,  model.layers[il].ffn_gate_inp_b,
+                model.layers[il].ffn_up_exps,   model.layers[il].ffn_up_exps_b,
+                model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SWIGLU_OAI_MOE, false,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+                il);
+        cb(cur, "ffn_moe_out", il);
+
+        // residual around the MoE block
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Assembles the OpenELM graph. Head counts are looked up per layer (hparams.n_head(il) /
+// hparams.n_head_kv(il)), so the fused QKV projection is reshaped to
+// [n_embd_head_k, n_head + 2*n_head_kv, n_tokens] and sliced along the head axis. Q and K are
+// RMS-normed and rotated with RoPE; the feed-forward is a gated SiLU MLP.
+llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // per-layer head counts; these intentionally hide the context-wide members of the same name
+        const int64_t n_head     = hparams.n_head(il);
+        const int64_t n_head_kv  = hparams.n_head_kv(il);
+        const int64_t n_head_qkv = 2*n_head_kv + n_head;
+
+        cur = inpL;
+        ggml_tensor * residual = cur;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            cur = build_lora_mm(model.layers[il].wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+
+            // heads are laid out as [Q heads | K heads | V heads]; offsets are in units of cur->nb[1]
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            cb(Vcur, "Vcur", il);
+
+            Qcur = build_norm(Qcur,
+                    model.layers[il].attn_q_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur", il);
+
+            Kcur = build_norm(Kcur,
+                    model.layers[il].attn_k_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur", il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, NULL,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, NULL,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            // label the V tensor itself (this call used to pass Qcur, which tagged Q as "Vcur")
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+        // residual around the feed-forward
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+    cur = inpL;
+
+    // norm
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph for Orion. Each layer runs LLM_NORM (weight + bias) -> RoPE self-attention -> residual add,
+    // then LLM_NORM -> build_ffn (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add. After the last layer the
+    // hidden state is normalized (stored as t_embd) and projected by model.output (stored as t_logits).
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    // K and V heads must have the same size, and the rotary dimension must equal the head size
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    const float kq_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions fed to both rope calls
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * skip = hidden;
+
+        ggml_tensor * normed = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(normed, "attn_norm", il);
+
+        // Q/K/V projections; no bias is added to any of them in this builder
+        ggml_tensor * q = build_lora_mm(layer.wq, normed);
+        cb(q, "Qcur", il);
+
+        ggml_tensor * k = build_lora_mm(layer.wk, normed);
+        cb(k, "Kcur", il);
+
+        ggml_tensor * v = build_lora_mm(layer.wv, normed);
+        cb(v, "Vcur", il);
+
+        // split the flat projections into {head_dim, heads, tokens}
+        q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+        k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+        v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+        // rotary embedding on Q and K only
+        q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        cb(q, "Qcur", il);
+        cb(k, "Kcur", il);
+        cb(v, "Vcur", il);
+
+        ggml_tensor * attn_out = build_attn(attn_inp,
+                layer.wo, NULL,
+                q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        if (il == n_layer - 1 && out_ids) {
+            attn_out = ggml_get_rows(ctx0, attn_out, out_ids);
+            skip     = ggml_get_rows(ctx0, skip,     out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, attn_out, skip);
+        cb(ffn_inp, "ffn_inp", il);
+
+        ggml_tensor * ffn = build_norm(ffn_inp, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+        cb(ffn, "ffn_norm", il);
+
+        ffn = build_ffn(ffn,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(ffn, "ffn_out", il);
+
+        ggml_tensor * layer_out = ggml_add(ctx0, ffn, ffn_inp);
+
+        layer_out = build_cvec(layer_out, il);
+        cb(layer_out, "l_out", il);
+
+        // feeds the next layer
+        hidden = layer_out;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph for Phi-2. Within a layer, attention and the FFN both read the same LLM_NORM output;
+    // the attention output, the FFN output and the layer input are then summed. The final hidden
+    // state is normalized (t_embd), projected by model.output and offset by model.output_b (t_logits).
+    const int64_t head_dim = hparams.n_embd_head_v;
+    const int64_t kv_width = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+
+    const float q_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions fed to both rope calls
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * normed = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(normed, "attn_norm", il);
+
+        ggml_tensor * q = nullptr;
+        ggml_tensor * k = nullptr;
+        ggml_tensor * v = nullptr;
+
+        if (!layer.wqkv) {
+            // separate projections, each followed by its bias
+            q = ggml_add(ctx0, build_lora_mm(layer.wq, normed), layer.bq);
+            k = ggml_add(ctx0, build_lora_mm(layer.wk, normed), layer.bk);
+            v = ggml_add(ctx0, build_lora_mm(layer.wv, normed), layer.bv);
+
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+        } else {
+            // fused projection: one matmul + bias, then Q/K/V are carved out as views
+            ggml_tensor * qkv = build_lora_mm(layer.wqkv, normed);
+            cb(qkv, "wqkv", il);
+
+            qkv = ggml_add(ctx0, qkv, layer.bqkv);
+            cb(qkv, "bqkv", il);
+
+            // byte stride of one head, and byte offsets of the K and V sections inside a row
+            const size_t head_bytes = head_dim*sizeof(float);
+            const size_t k_off      = sizeof(float)*(n_embd);
+            const size_t v_off      = sizeof(float)*(n_embd + kv_width);
+
+            q = ggml_view_3d(ctx0, qkv, head_dim, n_head,    n_tokens, head_bytes, qkv->nb[1], 0);
+            k = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_tokens, head_bytes, qkv->nb[1], k_off);
+            v = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_tokens, head_bytes, qkv->nb[1], v_off);
+        }
+
+        q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        cb(q, "Qcur", il);
+        cb(k, "Kcur", il);
+        cb(v, "Vcur", il);
+
+        // Q is pre-scaled here (and build_attn is given a scale of 1.0f) to avoid precision issues
+        // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+        q = ggml_scale(ctx0, q, q_scale);
+
+        ggml_tensor * attn_out = build_attn(attn_inp,
+                layer.wo, layer.bo,
+                q, k, v, nullptr, nullptr, nullptr, 1.0f, il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        if (il == n_layer - 1 && out_ids) {
+            attn_out = ggml_get_rows(ctx0, attn_out, out_ids);
+            hidden   = ggml_get_rows(ctx0, hidden,   out_ids);
+            normed   = ggml_get_rows(ctx0, normed,   out_ids);
+        }
+
+        // FFN reads the attention-norm output, not the attention result
+        ggml_tensor * ffn_out = build_ffn(normed,
+                layer.ffn_up,   layer.ffn_up_b,   NULL,
+                NULL,           NULL,             NULL,
+                layer.ffn_down, layer.ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_GELU, LLM_FFN_SEQ, il);
+        cb(ffn_out, "ffn_out", il);
+
+        ggml_tensor * layer_out = ggml_add(ctx0, attn_out, ffn_out);
+        layer_out = ggml_add(ctx0, layer_out, hidden);
+
+        layer_out = build_cvec(layer_out, il);
+        cb(layer_out, "l_out", il);
+
+        // feeds the next layer
+        hidden = layer_out;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output_no_bias", -1);
+
+    out = ggml_add(ctx0, out, model.output_b);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+template<bool iswa>
+llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph for Phi-3. The template flag selects which attention input is built
+    // (build_attn_inp_kv_iswa when iswa is true, build_attn_inp_kv otherwise). Each layer is
+    // RMS norm -> RoPE attention -> residual, then RMS norm -> dense FFN or MoE FFN -> residual.
+    const int64_t head_dim = hparams.n_embd_head_v;
+    const int64_t kv_width = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+
+    const float q_scale = 1.0f / sqrtf(float(head_dim));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions fed to both rope calls
+    ggml_tensor * positions = build_inp_pos();
+
+    using attn_inp_t = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    attn_inp_t * attn_inp = nullptr;
+
+    if constexpr (iswa) {
+        attn_inp = build_attn_inp_kv_iswa();
+    } else {
+        attn_inp = build_attn_inp_kv();
+    }
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * skip = hidden;
+
+        // per-layer RoPE frequency factors, handed to both rope calls below
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        ggml_tensor * normed = build_norm(hidden, layer.attn_norm, layer.attn_norm_b, LLM_NORM_RMS, il);
+        cb(normed, "attn_norm", il);
+
+        ggml_tensor * q = nullptr;
+        ggml_tensor * k = nullptr;
+        ggml_tensor * v = nullptr;
+
+        if (!layer.wqkv) {
+            // separate projections, each followed by its bias
+            q = ggml_add(ctx0, build_lora_mm(layer.wq, normed), layer.bq);
+            k = ggml_add(ctx0, build_lora_mm(layer.wk, normed), layer.bk);
+            v = ggml_add(ctx0, build_lora_mm(layer.wv, normed), layer.bv);
+
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+        } else {
+            // fused projection (no bias in this path); Q/K/V are views into its output
+            ggml_tensor * qkv = build_lora_mm(layer.wqkv, normed);
+            cb(qkv, "wqkv", il);
+
+            // byte stride of one head, and byte offsets of the K and V sections inside a row
+            const size_t head_bytes = head_dim * sizeof(float);
+            const size_t k_off      = sizeof(float) * (n_embd);
+            const size_t v_off      = sizeof(float) * (n_embd + kv_width);
+
+            q = ggml_view_3d(ctx0, qkv, head_dim, n_head,    n_tokens, head_bytes, qkv->nb[1], 0);
+            k = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_tokens, head_bytes, qkv->nb[1], k_off);
+            v = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_tokens, head_bytes, qkv->nb[1], v_off);
+        }
+
+        q = ggml_rope_ext(ctx0, q, positions, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        k = ggml_rope_ext(ctx0, k, positions, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        cb(q, "Qcur", il);
+        cb(k, "Kcur", il);
+        cb(v, "Vcur", il);
+
+        // Q is pre-scaled; build_attn is therefore given a scale of 1.0f
+        q = ggml_scale(ctx0, q, q_scale);
+        cb(q, "Qcur", il);
+
+        ggml_tensor * x = build_attn(attn_inp,
+                layer.wo, layer.bo,
+                q, k, v, nullptr, nullptr, nullptr, 1.0f, il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        if (il == n_layer - 1 && out_ids) {
+            x    = ggml_get_rows(ctx0, x,    out_ids);
+            skip = ggml_get_rows(ctx0, skip, out_ids);
+        }
+
+        x    = ggml_add(ctx0, x, skip);
+        skip = x;
+
+        x = build_norm(x, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM_RMS, il);
+        cb(x, "ffn_norm", il);
+
+        if (layer.ffn_gate_inp != nullptr) {
+            // a router tensor is present: mixture-of-experts FFN
+            x = build_moe_ffn(x,
+                    layer.ffn_gate_inp,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(x, "ffn_moe_out", il);
+        } else {
+            // dense FFN: up and down weights only, LLM_FFN_SWIGLU activation
+            x = build_ffn(x,
+                    layer.ffn_up,   NULL, NULL,
+                    NULL,           NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+            cb(x, "ffn_out", il);
+        }
+
+        x = ggml_add(ctx0, skip, x);
+
+        x = build_cvec(x, il);
+        cb(x, "l_out", il);
+
+        // feeds the next layer
+        hidden = x;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+
+    // the output bias is optional
+    if (model.output_b != nullptr) {
+        cb(out, "result_output_no_bias", -1);
+        out = ggml_add(ctx0, out, model.output_b);
+    }
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
+
+// Explicit template instantiations
+template struct llm_build_phi3<false>;
+template struct llm_build_phi3<true>;
--- /dev/null
+#include "models.h"
+
+llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph for PLaMo. Attention and the FFN both consume the same RMS-normalized input; the layer
+    // output is FFN output + attention output + the un-normalized layer input.
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    // K and V heads must have the same size, and the rotary dimension must equal the head size
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    const float kq_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions fed to both rope calls
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // shared input of the attention and FFN branches
+        ggml_tensor * normed = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(normed, "attn_norm", il);
+
+        ggml_tensor * q = build_lora_mm(layer.wq, normed);
+        cb(q, "Qcur", il);
+
+        ggml_tensor * k = build_lora_mm(layer.wk, normed);
+        cb(k, "Kcur", il);
+
+        ggml_tensor * v = build_lora_mm(layer.wv, normed);
+        cb(v, "Vcur", il);
+
+        // split the flat projections into {head_dim, heads, tokens}
+        q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+        k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+        v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+        // rotary embedding over the full head size, on Q and K only
+        q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                head_dim, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                head_dim, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+        cb(q, "Qcur", il);
+        cb(k, "Kcur", il);
+        cb(v, "Vcur", il);
+
+        ggml_tensor * attn_out = build_attn(attn_inp,
+                layer.wo, NULL,
+                q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        if (il == n_layer - 1 && out_ids) {
+            attn_out = ggml_get_rows(ctx0, attn_out, out_ids);
+            normed   = ggml_get_rows(ctx0, normed,   out_ids);
+            hidden   = ggml_get_rows(ctx0, hidden,   out_ids);
+        }
+
+        ggml_tensor * ffn_out = build_ffn(normed,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(ffn_out, "ffn_out", il);
+
+        ggml_tensor * layer_out = ggml_add(ctx0, ffn_out, attn_out);
+        layer_out = ggml_add(ctx0, layer_out, hidden);
+
+        layer_out = build_cvec(layer_out, il);
+        cb(layer_out, "l_out", il);
+
+        // feeds the next layer
+        hidden = layer_out;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context_mamba(params) {
+    // Graph for PLaMo-2. Every layer runs one mixer (the Mamba layer when hparams.is_recurrent(il)
+    // is true, the attention layer otherwise) and one FFN; both sub-blocks are wrapped in an
+    // RMS norm before and after, and each is followed by a residual add.
+
+    // {n_embd, n_tokens}
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+    cb(hidden, "embedding_output", -1);
+
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * hybrid_inp = build_inp_mem_hybrid();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * skip = hidden;
+
+        // pre-mixer norm
+        ggml_tensor * x = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+
+        // pick the mixer for this layer
+        x = hparams.is_recurrent(il)
+            ? build_plamo2_mamba_layer(hybrid_inp->get_recr(), x, model, ubatch, il)
+            : build_plamo2_attn_layer(hybrid_inp->get_attn(), positions, x, model, il);
+
+        // post-mixer norm
+        x = build_norm(x, layer.attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(x, "attn_post_norm", il);
+
+        // first residual
+        x = ggml_add(ctx0, x, skip);
+        cb(x, "attn_residual", il);
+        skip = x;
+
+        // pre-FFN norm
+        x = build_norm(x, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(x, "ffn_pre_norm", il);
+
+        // FFN with up and down weights only, LLM_FFN_SWIGLU activation
+        x = build_ffn(x,
+                layer.ffn_up,   NULL, NULL,
+                NULL,           NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+        cb(x, "ffn_out", il);
+
+        // post-FFN norm
+        x = build_norm(x, layer.ffn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(x, "ffn_post_norm", il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        if (il == n_layer - 1 && out_ids) {
+            x    = ggml_get_rows(ctx0, x,    out_ids);
+            skip = ggml_get_rows(ctx0, skip, out_ids);
+        }
+
+        // second residual
+        x = ggml_add(ctx0, x, skip);
+        cb(x, "ffn_residual", il);
+
+        hidden = x;
+    }
+
+    // final norm
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    // lm_head
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+
+    // explicitly flag the logits as a graph output
+    ggml_set_output(out);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
+
+ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv * inp,
+                                                         ggml_tensor *             inp_pos,
+                                                         ggml_tensor *             cur,
+                                                         const llama_model &       model,
+                                                         int                       il) {
+    // Builds the attention mixer of layer il: fused QKV projection of cur, RMS norm and RoPE on
+    // Q and K, then build_attn with the layer's output projection. Returns the attention output.
+    const auto & layer = model.layers[il];
+
+    // head geometry for this layer; Q heads use the K head size
+    const int64_t head_dim_qk = hparams.n_embd_head_k;
+    const int64_t head_dim_v  = hparams.n_embd_head_v;
+    const int32_t heads_q     = hparams.n_head(il);
+    const int32_t heads_kv    = hparams.n_head_kv(il);
+
+    ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur);
+    cb(qkv, "wqkv", il);
+
+    // element offsets of the K and V sections inside one row of qkv (Q starts at 0)
+    const int64_t k_start = head_dim_qk * heads_q;
+    const int64_t v_start = k_start + head_dim_qk * heads_kv;
+
+    const size_t elem_bytes = ggml_element_size(qkv);
+
+    ggml_tensor * q = ggml_view_3d(ctx0, qkv, head_dim_qk, heads_q, n_tokens,
+            head_dim_qk * sizeof(float), qkv->nb[1], 0);
+    ggml_tensor * k = ggml_view_3d(ctx0, qkv, head_dim_qk, heads_kv, n_tokens,
+            head_dim_qk * sizeof(float), qkv->nb[1], k_start * elem_bytes);
+    ggml_tensor * v = ggml_view_3d(ctx0, qkv, head_dim_v, heads_kv, n_tokens,
+            head_dim_v * sizeof(float), qkv->nb[1], v_start * elem_bytes);
+
+    cb(q, "Qcur", il);
+    cb(k, "Kcur", il);
+    cb(v, "Vcur", il);
+
+    // Q: RMS norm, then rotary embedding
+    q = build_norm(q, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
+    cb(q, "Qcur_normed", il);
+
+    q = ggml_rope_ext(ctx0, q, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+
+    // K: RMS norm, then rotary embedding
+    k = build_norm(k, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
+    cb(k, "Kcur_normed", il);
+
+    k = ggml_rope_ext(ctx0, k, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+
+    // the attention scale is derived from the V head size
+    const float kq_scale = 1.0f / sqrtf(float(head_dim_v));
+
+    ggml_tensor * attn_out = build_attn(inp,
+            layer.wo, NULL,
+            q, k, v, NULL, NULL, NULL, kq_scale, il);
+
+    cb(attn_out, "attn_out", il);
+
+    return attn_out;
+}
+
+ggml_tensor * llm_build_plamo2::build_plamo2_mamba_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) { // Builds the Mamba mixer sub-graph of layer il from cur; returns the out-projection reshaped to {ne0, n_seq_tokens * n_seqs}.
+ const auto * mctx_cur = inp->mctx; // memory context carried by the recurrent-state input
+
+ const auto kv_head = mctx_cur->get_head(); // multiplies the per-sequence state size in the write-back byte offsets below
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_heads = hparams.ssm_dt_rank; // the head count is read from ssm_dt_rank
+ const int64_t head_dim = d_inner / n_heads;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0); // the ubatch must contain at least one sequence
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); // token count must factor exactly into sequences x tokens-per-sequence
+
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs); // => {d_conv - 1, d_inner + 2*n_group*d_state, n_seqs}
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
+ cb(zx, "mamba_in_proj", il);
+ // example shapes: {8192, 5, 1, 1} -> {8192, 1, 5, 1}
+ zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
+ zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs); // make contiguous as {2*head_dim, n_heads, n_seq_tokens, n_seqs}
+ cb(zx, "mamba_in_proj_out", il);
+
+ // split each head's 2*head_dim slice into z (elements [0, head_dim)) and x (elements [head_dim, 2*head_dim))
+ // => {head_dim * n_heads, n_seq_tokens, n_seqs}
+ ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3],
+ head_dim * ggml_element_size(zx));
+ x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+ // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
+ cb(x, "mamba_x_split", il);
+
+ ggml_tensor * z =
+ ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0); // z stays a strided view; it is made contiguous only at the gating step
+ cb(z, "mamba_z_split", il);
+
+ // conv1d
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); // cached conv state followed by the transposed new x, joined along dim 0
+ cb(conv_x, "mamba_conv1d_input", il);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
+ n_seq_tokens * (conv_x->nb[0])); // byte offset skips the first n_seq_tokens entries along dim 0
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all,
+ (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
+ ggml_element_size(conv_states_all))));
+ cb(conv_states_all, "mamba_conv1d_state", il);
+
+ // 1D convolution
+ x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+ cb(x, "mamba_conv1d", il);
+
+ x = ggml_silu(ctx0, x);
+ cb(x, "mamba_conv1d_silu", il);
+ }
+
+ // SSM
+ {
+ // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+ ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
+ cb(x_bcdt, "mamba_bcdt_proj", il);
+
+ // split into dt, B, C; along dim 0 the views read B at [0, d_state), C at [d_state, 2*d_state) and dt from 2*d_state on
+ const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16)); // dt width: n_embd / 16, but never below 64
+ ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
+ ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2],
+ ggml_element_size(x_bcdt) * d_state);
+ ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2],
+ ggml_element_size(x_bcdt) * (2 * d_state));
+ cb(B, "mamba_B_raw", il);
+ cb(C, "mamba_C_raw", il);
+ cb(dt, "mamba_dt_raw", il);
+
+ // Apply RMS norm to dt, B, C (PLaMo-2 specific)
+ B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
+ C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
+ dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
+ cb(B, "mamba_B_normed", il);
+ cb(C, "mamba_C_normed", il);
+ cb(dt, "mamba_dt_normed", il);
+
+ // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+ dt = build_lora_mm(model.layers[il].ssm_dt, dt);
+ dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); // projection bias
+ cb(dt, "mamba_dt_proj", il);
+
+ ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads); // A viewed as {1, n_heads}
+ cb(A, "mamba_A", il);
+
+ x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x),
+ head_dim * n_heads * ggml_element_size(x),
+ head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0); // x as {head_dim, n_heads, n_seq_tokens, n_seqs} with densely packed strides
+ B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0); // insert a size-1 dim 1
+ C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0); // insert a size-1 dim 1
+
+ // use the states and the indices provided by build_rs
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding to make unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
+
+ // Custom operator to optimize the parallel associative scan
+ // as described in the Annex D of the Mamba paper.
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); // get_ssm_rows is passed as the row getter, so y_ssm is the ggml_ssm_scan result
+ cb(y_ssm, "mamba_ssm_scan", il);
+
+ // store last states: the part of y_ssm after its first n_heads*head_dim*n_seq_tokens*n_seqs elements is copied into ssm_states_all
+ ggml_build_forward_expand(
+ gf, ggml_cpy(
+ ctx0,
+ ggml_view_1d(ctx0, y_ssm, n_heads * head_dim * d_state * n_seqs,
+ n_heads * head_dim * n_seq_tokens * n_seqs * ggml_element_size(y_ssm)),
+ ggml_view_1d(ctx0, ssm_states_all, n_heads * head_dim * d_state * n_seqs,
+ kv_head * n_seqs * n_heads * head_dim * d_state * ggml_element_size(ssm_states_all))));
+ cb(ssm_states_all, "mamba_ssm_states", il);
+
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs,
+ head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x),
+ head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0); // leading part of y_ssm viewed as the per-token output; strides use x's element size
+ cb(y, "mamba_y_view", il);
+
+ // Add D parameter and apply gating with z
+ // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads); // D viewed as {1, n_heads}
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
+ cb(y, "mamba_y_add_d", il);
+
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); // z is made contiguous first because it is a strided view into zx
+ cb(y, "mamba_y_swiglu_z", il);
+
+ // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0); // merge the head dims back into one dim of size head_dim * n_heads
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
+ cb(cur, "mamba_out_proj", il);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ cb(cur, "mamba_out", il);
+
+ return cur;
+}
--- /dev/null
+#include "models.h"
+
+llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph for PLM. Attention splits every Q head into a non-rotary part and a rotary part, and
+    // derives K/V from a low-rank compressed projection plus one rotary key slice shared by all
+    // heads. The FFN uses up and down weights only, with the LLM_FFN_RELU_SQR activation.
+    const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+
+    const uint32_t rope_dim  = hparams.n_rot;
+    const uint32_t nope_dim  = hparams.n_embd_head_k - hparams.n_rot;
+    const uint32_t lora_rank = hparams.n_lora_kv;
+
+    const auto head_k = hparams.n_embd_head_k;
+    const auto head_v = hparams.n_embd_head_v;
+
+    // elements occupied by one head inside the decompressed kv tensor: non-rotary key part + value part
+    const auto kv_head_span = nope_dim + head_v;
+
+    // {n_embd, n_tokens}
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // positions fed to both rope calls
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * skip = hidden;
+
+        ggml_tensor * normed = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(normed, "attn_norm", il);
+
+        // ---- queries ----
+        ggml_tensor * q_all = ggml_mul_mat(ctx0, layer.wq, normed);
+        cb(q_all, "q", il);
+
+        const size_t q_head_bytes = ggml_row_size(q_all->type, head_k);
+        const size_t q_tok_bytes  = ggml_row_size(q_all->type, head_k * n_head);
+
+        // leading nope_dim elements of each head
+        ggml_tensor * q_plain = ggml_view_3d(ctx0, q_all, nope_dim, n_head, n_tokens,
+                q_head_bytes, q_tok_bytes, 0);
+        cb(q_plain, "q_nope", il);
+
+        // trailing rope_dim elements of each head
+        ggml_tensor * q_rot = ggml_view_3d(ctx0, q_all, rope_dim, n_head, n_tokens,
+                q_head_bytes, q_tok_bytes, ggml_row_size(q_all->type, nope_dim));
+        cb(q_rot, "q_pe", il);
+
+        // ---- compressed keys/values ----
+        // {n_embd, lora_rank + rope_dim} * {n_embd, n_tokens} -> {lora_rank + rope_dim, n_tokens}
+        ggml_tensor * kv_packed = ggml_mul_mat(ctx0, layer.wkv_a_mqa, normed);
+        cb(kv_packed, "kv_pe_compresseed", il);
+
+        // first lora_rank elements of each row: the compressed kv
+        ggml_tensor * kv_low = ggml_view_2d(ctx0, kv_packed, lora_rank, n_tokens,
+                kv_packed->nb[1], 0);
+        cb(kv_low, "kv_compressed", il);
+
+        // the rope_dim elements after them: a single rotary key slice per token
+        ggml_tensor * k_rot = ggml_view_3d(ctx0, kv_packed, rope_dim, 1, n_tokens,
+                kv_packed->nb[1], kv_packed->nb[1],
+                ggml_row_size(kv_packed->type, lora_rank));
+        cb(k_rot, "k_pe", il);
+
+        kv_low = build_norm(kv_low, layer.attn_kv_a_norm, NULL, LLM_NORM_RMS, il);
+        cb(kv_low, "kv_compressed", il);
+
+        // {lora_rank, n_head * kv_head_span} * {lora_rank, n_tokens} -> {n_head * kv_head_span, n_tokens}
+        ggml_tensor * kv_full = ggml_mul_mat(ctx0, layer.wkv_b, kv_low);
+        cb(kv_full, "kv", il);
+
+        // per head: nope_dim key elements first ...
+        ggml_tensor * k_plain = ggml_view_3d(ctx0, kv_full, nope_dim, n_head, n_tokens,
+                ggml_row_size(kv_full->type, kv_head_span),
+                ggml_row_size(kv_full->type, n_head * kv_head_span),
+                0);
+        cb(k_plain, "k_nope", il);
+
+        // ... followed by head_v value elements
+        ggml_tensor * v_all = ggml_view_3d(ctx0, kv_full, head_v, n_head, n_tokens,
+                ggml_row_size(kv_full->type, kv_head_span),
+                ggml_row_size(kv_full->type, kv_head_span * n_head),
+                ggml_row_size(kv_full->type, nope_dim));
+        cb(v_all, "v_states", il);
+
+        // pack the values, then flatten the heads into one row per token
+        v_all = ggml_cont(ctx0, v_all);
+        cb(v_all, "v_states", il);
+
+        v_all = ggml_view_2d(ctx0, v_all, head_v * n_head, n_tokens,
+                ggml_row_size(kv_full->type, head_v * n_head),
+                0);
+        cb(v_all, "v_states", il);
+
+        // ---- rotary embedding on the rotary slices only ----
+        q_rot = ggml_rope_ext(ctx0, q_rot, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+        cb(q_rot, "q_pe", il);
+
+        // this key slice is shared: it is repeated to q_rot's shape below
+        k_rot = ggml_rope_ext(ctx0, k_rot, positions, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+        cb(k_rot, "k_pe", il);
+
+        // reassemble full heads: [non-rotary | rotary] along dim 0
+        ggml_tensor * q_full = ggml_concat(ctx0, q_plain, q_rot, 0);
+        cb(q_full, "q_states", il);
+
+        ggml_tensor * k_full = ggml_concat(ctx0, k_plain, ggml_repeat(ctx0, k_rot, q_rot), 0);
+        cb(k_full, "k_states", il);
+
+        ggml_tensor * x = build_attn(attn_inp,
+                layer.wo, NULL,
+                q_full, k_full, v_all, nullptr, nullptr, nullptr, kq_scale, il);
+
+        // on the last layer, keep only the rows selected by out_ids (when that input exists)
+        if (il == n_layer - 1 && out_ids) {
+            x    = ggml_get_rows(ctx0, x,    out_ids);
+            skip = ggml_get_rows(ctx0, skip, out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, x, skip);
+        cb(ffn_inp, "ffn_inp", il);
+
+        x = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(x, "ffn_norm", il);
+
+        x = build_ffn(x,
+                layer.ffn_up,   NULL, NULL,
+                NULL,           NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+        cb(x, "ffn_out", il);
+
+        x = ggml_add(ctx0, x, ffn_inp);
+
+        x = build_cvec(x, il);
+        cb(x, "l_out", il);
+
+        // feeds the next layer
+        hidden = x;
+    }
+
+    ggml_tensor * out = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(out, "result_norm", -1);
+    res->t_embd = out;
+
+    out = build_lora_mm(model.output, out);
+    cb(out, "result_output", -1);
+    res->t_logits = out;
+
+    ggml_build_forward_expand(gf, out);
+}
--- /dev/null
+#include "models.h"
+
+
+llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph builder for the Qwen architecture. Every layer runs:
+    //   RMS norm -> fused QKV projection (+bias) -> RoPE on Q/K -> attention -> residual add
+    //   -> RMS norm -> FFN (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
+    // The final normed state is stored in res->t_embd, the output projection in res->t_logits.
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    // K and V heads are required to have the same size
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+
+    const float  attn_scale = 1.0f/sqrtf(float(head_dim));
+    const size_t head_bytes = head_dim*sizeof(float); // byte stride between heads inside the views
+    const size_t part_bytes = sizeof(float)*(n_embd); // byte distance between the Q, K and V parts
+
+    ggml_tensor * hid;
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around attention
+        ggml_tensor * resid = layer_in;
+
+        hid = build_norm(layer_in, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "attn_norm", il);
+
+        // self-attention on top of a single fused QKV projection
+        {
+            hid = build_lora_mm(layer.wqkv, hid);
+            cb(hid, "wqkv", il);
+
+            hid = ggml_add(ctx0, hid, layer.bqkv);
+            cb(hid, "bqkv", il);
+
+            // Q, K and V are views into the fused result, starting 0, 1 and 2 times part_bytes into each row
+            ggml_tensor * q = ggml_view_3d(ctx0, hid, head_dim, n_head,    n_tokens, head_bytes, hid->nb[1], 0*part_bytes);
+            ggml_tensor * k = ggml_view_3d(ctx0, hid, head_dim, n_head_kv, n_tokens, head_bytes, hid->nb[1], 1*part_bytes);
+            ggml_tensor * v = ggml_view_3d(ctx0, hid, head_dim, n_head_kv, n_tokens, head_bytes, hid->nb[1], 2*part_bytes);
+
+            // rotate Q and K using the configured rope_type
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            hid = build_attn(attn_inp,
+                    layer.wo, NULL,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows listed in out_ids
+        if (il == n_layer - 1 && out_ids) {
+            hid   = ggml_get_rows(ctx0, hid,   out_ids);
+            resid = ggml_get_rows(ctx0, resid, out_ids);
+        }
+
+        ggml_tensor * post_attn = ggml_add(ctx0, hid, resid);
+        cb(post_attn, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            hid = build_norm(post_attn, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(hid, "ffn_norm", il);
+
+            hid = build_ffn(hid,
+                    layer.ffn_up,   NULL, NULL,
+                    layer.ffn_gate, NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(hid, "ffn_out", il);
+        }
+
+        hid = ggml_add(ctx0, hid, post_attn);
+
+        hid = build_cvec(hid, il);
+        cb(hid, "l_out", il);
+
+        // feeds the next layer
+        layer_in = hid;
+    }
+
+    hid = build_norm(layer_in, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(hid, "result_norm", -1);
+    res->t_embd = hid;
+
+    // lm_head
+    hid = build_lora_mm(model.output, hid);
+
+    cb(hid, "result_output", -1);
+    res->t_logits = hid;
+
+    ggml_build_forward_expand(gf, hid);
+}
--- /dev/null
+#include "models.h"
+
+
+llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph builder for Qwen2. Every layer runs:
+    //   RMS norm -> biased Q/K/V projections -> RoPE on Q/K -> attention -> residual add
+    //   -> RMS norm -> FFN (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
+    // After the last layer: final RMS norm (res->t_embd), then lm_head plus model.output_b
+    // when that bias is present (res->t_logits).
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    const float attn_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hid;
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around attention
+        ggml_tensor * resid = layer_in;
+
+        // pre-attention norm
+        hid = build_norm(layer_in, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "attn_norm", il);
+
+        // self-attention
+        {
+            // projections with their biases
+            ggml_tensor * q = ggml_add(ctx0, build_lora_mm(layer.wq, hid), layer.bq);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = ggml_add(ctx0, build_lora_mm(layer.wk, hid), layer.bk);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = ggml_add(ctx0, build_lora_mm(layer.wv, hid), layer.bv);
+            cb(v, "Vcur", il);
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+            // RoPE is applied to Q and K only
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            hid = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows listed in out_ids
+        if (il == n_layer - 1 && out_ids) {
+            hid   = ggml_get_rows(ctx0, hid,   out_ids);
+            resid = ggml_get_rows(ctx0, resid, out_ids);
+        }
+
+        ggml_tensor * post_attn = ggml_add(ctx0, hid, resid);
+        cb(post_attn, "ffn_inp", il);
+
+        // feed-forward network
+        hid = build_norm(post_attn, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "ffn_norm", il);
+
+        hid = build_ffn(hid,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(hid, "ffn_out", il);
+
+        hid = ggml_add(ctx0, hid, post_attn);
+
+        hid = build_cvec(hid, il);
+        cb(hid, "l_out", il);
+
+        // feeds the next layer
+        layer_in = hid;
+    }
+
+    hid = build_norm(layer_in, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(hid, "result_norm", -1);
+    res->t_embd = hid;
+
+    // lm_head
+    hid = build_lora_mm(model.output, hid);
+
+    // the output bias is optional
+    if (model.output_b != nullptr) {
+        hid = ggml_add(ctx0, hid, model.output_b);
+    }
+
+    cb(hid, "result_output", -1);
+    res->t_logits = hid;
+
+    ggml_build_forward_expand(gf, hid);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Builds the compute graph for Qwen2-MoE. Every layer runs:
+    //   RMS norm -> Q/K/V projections (each bias applied only when present) -> RoPE on Q/K
+    //   -> attention -> residual add -> RMS norm -> routed experts (build_moe_ffn)
+    //   + sigmoid-gated shared expert -> residual add.
+    // The final normed state is stored in res->t_embd, the output projection in res->t_logits.
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // kept for the residual connection around attention
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            // compute Q, K and V; each bias is optional
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+
+            // split into heads
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // RoPE is applied to Q and K only
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        // on the last layer keep only the rows listed in inp_out_ids
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // routed experts
+        ggml_tensor * moe_out =
+            build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        // FFN shared expert, scaled by a per-token sigmoid gate and added to the routed output
+        {
+            ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
+            cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+            // sigmoid: computed directly instead of as silu(x)/x, which yields 0/0 = NaN
+            // whenever a gate logit is exactly 0 and needs two graph nodes instead of one
+            ggml_tensor * cur_gate = ggml_sigmoid(ctx0, cur_gate_inp);
+            cb(cur_gate, "ffn_shexp_gate", il);
+
+            ggml_tensor * cur_ffn = build_ffn(cur,
+                    model.layers[il].ffn_up_shexp,   NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_ffn, "ffn_shexp", il);
+
+            ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+            cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+            moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+            cb(moe_out, "ffn_out", il);
+
+            cur = moe_out;
+        }
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph builder for Qwen2-VL. Every layer runs:
+    //   RMS norm -> biased Q/K/V projections -> ggml_rope_multi on Q/K (using hparams.rope_sections)
+    //   -> attention -> residual add -> RMS norm -> FFN (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
+    // The final normed state is stored in res->t_embd, the output projection in res->t_logits.
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    const float attn_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hid;
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    // plain int copy of the first four rope sections, as handed to ggml_rope_multi
+    int sections[4];
+    for (int i = 0; i < 4; ++i) {
+        sections[i] = hparams.rope_sections[i];
+    }
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around attention
+        ggml_tensor * resid = layer_in;
+
+        // pre-attention norm
+        hid = build_norm(layer_in, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "attn_norm", il);
+
+        // self-attention
+        {
+            // projections with their biases
+            ggml_tensor * q = ggml_add(ctx0, build_lora_mm(layer.wq, hid), layer.bq);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = ggml_add(ctx0, build_lora_mm(layer.wk, hid), layer.bk);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = ggml_add(ctx0, build_lora_mm(layer.wv, hid), layer.bv);
+            cb(v, "Vcur", il);
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+            // sectioned RoPE on Q and K only
+            q = ggml_rope_multi(ctx0, q, positions, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_multi(ctx0, k, positions, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            hid = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows listed in out_ids
+        if (il == n_layer - 1 && out_ids) {
+            hid   = ggml_get_rows(ctx0, hid,   out_ids);
+            resid = ggml_get_rows(ctx0, resid, out_ids);
+        }
+
+        ggml_tensor * post_attn = ggml_add(ctx0, hid, resid);
+        cb(post_attn, "ffn_inp", il);
+
+        // feed-forward network
+        hid = build_norm(post_attn, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "ffn_norm", il);
+
+        hid = build_ffn(hid,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(hid, "ffn_out", il);
+
+        hid = ggml_add(ctx0, hid, post_attn);
+
+        hid = build_cvec(hid, il);
+        cb(hid, "l_out", il);
+
+        // feeds the next layer
+        layer_in = hid;
+    }
+
+    hid = build_norm(layer_in, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(hid, "result_norm", -1);
+    res->t_embd = hid;
+
+    // lm_head
+    hid = build_lora_mm(model.output, hid);
+
+    cb(hid, "result_output", -1);
+    res->t_logits = hid;
+
+    ggml_build_forward_expand(gf, hid);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph builder for Qwen3. Every layer runs:
+    //   RMS norm -> Q/K/V projections -> per-head RMS norm on Q and K -> RoPE on Q/K
+    //   -> attention -> residual add -> RMS norm -> FFN (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
+    // The final normed state is stored in res->t_embd, the output projection in res->t_logits.
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    const float attn_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hid;
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around attention
+        ggml_tensor * resid = layer_in;
+
+        // pre-attention norm
+        hid = build_norm(layer_in, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, hid);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, hid);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, hid);
+            cb(v, "Vcur", il);
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+            // Q: RMS norm with attn_q_norm, then RoPE
+            q = build_norm(q, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(q, "Qcur_normed", il);
+
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // K: RMS norm with attn_k_norm, then RoPE
+            k = build_norm(k, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(k, "Kcur_normed", il);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            hid = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows listed in out_ids
+        if (il == n_layer - 1 && out_ids) {
+            hid   = ggml_get_rows(ctx0, hid,   out_ids);
+            resid = ggml_get_rows(ctx0, resid, out_ids);
+        }
+
+        ggml_tensor * post_attn = ggml_add(ctx0, hid, resid);
+        cb(post_attn, "ffn_inp", il);
+
+        // feed-forward network
+        hid = build_norm(post_attn, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "ffn_norm", il);
+
+        hid = build_ffn(hid,
+                layer.ffn_up,   NULL, NULL,
+                layer.ffn_gate, NULL, NULL,
+                layer.ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(hid, "ffn_out", il);
+
+        hid = ggml_add(ctx0, hid, post_attn);
+
+        hid = build_cvec(hid, il);
+        cb(hid, "l_out", il);
+
+        // feeds the next layer
+        layer_in = hid;
+    }
+
+    hid = build_norm(layer_in, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(hid, "result_norm", -1);
+    res->t_embd = hid;
+
+    // lm_head
+    hid = build_lora_mm(model.output, hid);
+
+    cb(hid, "result_output", -1);
+    res->t_logits = hid;
+
+    ggml_build_forward_expand(gf, hid);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph builder for Qwen3-MoE. Every layer runs:
+    //   RMS norm -> Q/K/V projections -> per-head RMS norm on Q and K -> RoPE on Q/K
+    //   -> attention -> residual add -> RMS norm -> routed experts (build_moe_ffn) -> residual add.
+    // The final normed state is stored in res->t_embd, the output projection in res->t_logits.
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+    GGML_ASSERT(head_dim == hparams.n_rot);
+
+    const float attn_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hid;
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    // token positions consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around attention
+        ggml_tensor * resid = layer_in;
+
+        // pre-attention norm
+        hid = build_norm(layer_in, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, hid);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, hid);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, hid);
+            cb(v, "Vcur", il);
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+            // Q: RMS norm with attn_q_norm, then RoPE
+            q = build_norm(q, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(q, "Qcur_normed", il);
+
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            // K: RMS norm with attn_k_norm, then RoPE
+            k = build_norm(k, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(k, "Kcur_normed", il);
+
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            hid = build_attn(attn_inp,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows listed in out_ids
+        if (il == n_layer - 1 && out_ids) {
+            hid   = ggml_get_rows(ctx0, hid,   out_ids);
+            resid = ggml_get_rows(ctx0, resid, out_ids);
+        }
+
+        ggml_tensor * post_attn = ggml_add(ctx0, hid, resid);
+        cb(post_attn, "ffn_inp", il);
+
+        // MoE branch
+        hid = build_norm(post_attn, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "ffn_norm", il);
+
+        hid = build_moe_ffn(hid,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, true,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(hid, "ffn_moe_out", il);
+
+        hid = ggml_add(ctx0, hid, post_attn);
+
+        hid = build_cvec(hid, il);
+        cb(hid, "l_out", il);
+
+        // feeds the next layer
+        layer_in = hid;
+    }
+
+    hid = build_norm(layer_in, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(hid, "result_norm", -1);
+    res->t_embd = hid;
+
+    // lm_head
+    hid = build_lora_mm(model.output, hid);
+
+    cb(hid, "result_output", -1);
+    res->t_logits = hid;
+
+    ggml_build_forward_expand(gf, hid);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds the Qwen3-VL MoE graph; results land in res->t_embd and res->t_logits
+ const int64_t n_embd_full = hparams.n_embd; // full input row width: main embd + deepstack embds
+ const size_t n_deepstack_layers = hparams.n_deepstack_layers;
+ const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); // width of one slice; NOTE(review): presumably shadows an inherited n_embd (llm_build_qwen uses n_embd undeclared) - confirm
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ // the K head size, V head size and rotary dimension must all be equal
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ // plain int copy of the first four hparams.rope_sections entries, passed to ggml_rope_multi below
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+ // one slot per deepstack layer; slots stay nullptr unless ubatch.embd is set
+ std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
+
+ if (ubatch.embd) {
+ // Image input: split main embd and deepstack embds (all are views into the rows of inpL)
+ ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0); // first n_embd values of each row
+ for (size_t i = 0; i < n_deepstack_layers; i++) {
+ deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); // slice i+1 of each row; offset assumes float elements
+ }
+ inpL = inpL_main; // only the main slice is fed to the layers
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // kept for the residual add after attention
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q, K and V projections (no biases are added here)
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ // split into heads: Q uses n_head, K and V use n_head_kv
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ // Q: RMS norm with attn_q_norm, then sectioned RoPE
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ // K: RMS norm with attn_k_norm, then sectioned RoPE
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ // on the last layer keep only the rows listed in inp_out_ids
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // residual add around attention
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // routed experts; there is no shared expert in this builder
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+ // residual add around the MoE branch
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+ // for embedding input, layer il (< n_deepstack_layers) gets deepstack slice il added to its output
+ if (ubatch.embd && (size_t)il < n_deepstack_layers) {
+ cur = ggml_add(ctx0, cur, deepstack_features[il]);
+ cb(cur, "deepstack_out", il);
+ }
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ // final RMS norm; its result is exposed as res->t_embd
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
--- /dev/null
+#include "models.h"
+
+llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // builds the Qwen3-VL (dense FFN) graph; results land in res->t_embd and res->t_logits
+
+ const int64_t n_embd_full = hparams.n_embd; // full input row width: main embd + deepstack embds
+ const size_t n_deepstack_layers = hparams.n_deepstack_layers;
+ const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); // width of one slice; NOTE(review): presumably shadows an inherited n_embd (llm_build_qwen uses n_embd undeclared) - confirm
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ // the K head size, V head size and rotary dimension must all be equal
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ // plain int copy of the first four hparams.rope_sections entries, passed to ggml_rope_multi below
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+ // one slot per deepstack layer; slots stay nullptr unless ubatch.embd is set
+ std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
+
+ if (ubatch.embd) {
+ // Image input: split main embd and deepstack embds (all are views into the rows of inpL)
+ ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0); // first n_embd values of each row
+ for (size_t i = 0; i < n_deepstack_layers; i++) {
+ deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float)); // slice i+1 of each row; offset assumes float elements
+ }
+ inpL = inpL_main; // only the main slice is fed to the layers
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL; // kept for the residual add after attention
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q, K and V projections (no biases are added here)
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ // split into heads: Q uses n_head, K and V use n_head_kv
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ // Q: RMS norm with attn_q_norm, then sectioned RoPE
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ // K: RMS norm with attn_k_norm, then sectioned RoPE
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ // on the last layer keep only the rows listed in inp_out_ids
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // residual add around attention
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // dense FFN built with LLM_FFN_SILU / LLM_FFN_PAR from ffn_up, ffn_gate and ffn_down
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ // residual add around the FFN
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+ // for embedding input, layer il (< n_deepstack_layers) gets deepstack slice il added to its output
+ if (ubatch.embd && (size_t)il < n_deepstack_layers) {
+ cur = ggml_add(ctx0, cur, deepstack_features[il]);
+ cb(cur, "deepstack_out", il);
+ }
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ // final RMS norm; its result is exposed as res->t_embd
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // Graph builder for Refact. Every layer runs:
+    //   RMS norm -> Q/K/V projections (no bias; no RoPE call is made in this builder)
+    //   -> attention -> residual add -> RMS norm -> FFN (LLM_FFN_SILU, LLM_FFN_PAR) -> residual add.
+    // The final normed state is stored in res->t_embd, the output projection in res->t_logits.
+    const int64_t head_dim = hparams.n_embd_head_v;
+
+    // K and V heads are required to have the same size
+    GGML_ASSERT(head_dim == hparams.n_embd_head_k);
+
+    const float attn_scale = 1.0f/sqrtf(float(head_dim));
+
+    ggml_tensor * hid;
+    ggml_tensor * layer_in = build_inp_embd(model.tok_embd);
+
+    auto * attn_inp = build_attn_inp_kv();
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // kept for the residual connection around attention
+        ggml_tensor * resid = layer_in;
+
+        hid = build_norm(layer_in, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(hid, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, hid);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, hid);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, hid);
+            cb(v, "Vcur", il);
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, head_dim, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, head_dim, n_head_kv, n_tokens);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            hid = build_attn(attn_inp,
+                    layer.wo, NULL,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows listed in out_ids
+        if (il == n_layer - 1 && out_ids) {
+            hid   = ggml_get_rows(ctx0, hid,   out_ids);
+            resid = ggml_get_rows(ctx0, resid, out_ids);
+        }
+
+        ggml_tensor * post_attn = ggml_add(ctx0, hid, resid);
+        cb(post_attn, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            hid = build_norm(post_attn, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(hid, "ffn_norm", il);
+
+            hid = build_ffn(hid,
+                    layer.ffn_up,   NULL, NULL,
+                    layer.ffn_gate, NULL, NULL,
+                    layer.ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(hid, "ffn_out", il);
+        }
+
+        hid = ggml_add(ctx0, hid, post_attn);
+
+        hid = build_cvec(hid, il);
+        cb(hid, "l_out", il);
+
+        // feeds the next layer
+        layer_in = hid;
+    }
+
+    hid = build_norm(layer_in, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(hid, "result_norm", -1);
+    res->t_embd = hid;
+
+    // lm_head
+    hid = build_lora_mm(model.output, hid);
+
+    cb(hid, "result_output", -1);
+    res->t_logits = hid;
+
+    ggml_build_forward_expand(gf, hid);
+}
--- /dev/null
+#include "models.h"
+
+// Constructs the shared RWKV6 builder base: forwards `params` to llm_graph_context and
+// initializes the `model` member from the `model` argument. No graph nodes are created here.
+llm_build_rwkv6_base::llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params)
+    , model(model) {
+}
+
+// Builds the RWKV6 channel-mix sub-graph from the weights in `layer`.
+//   cur    - current input tensor
+//   x_prev - tensor that `cur` is interpolated towards (x_prev - cur is scaled by the lerp weights)
+//   arch   - must be LLM_ARCH_RWKV6; any other value aborts
+// Returns sigmoid(receptance * xr) * (value * relu(key * xk)^2).
+ggml_tensor * llm_build_rwkv6_base::build_rwkv6_channel_mix(const llama_layer * layer,
+                                                            ggml_tensor * cur,
+                                                            ggml_tensor * x_prev,
+                                                            llm_arch arch) const {
+    // difference used by both interpolations below
+    ggml_tensor * delta = ggml_sub(ctx0, x_prev, cur);
+
+    // only one architecture is handled
+    if (arch != LLM_ARCH_RWKV6) {
+        GGML_ABORT("fatal error");
+    }
+
+    // xk = cur + delta * lerp_k, xr = cur + delta * lerp_r
+    ggml_tensor * key_in = ggml_add(ctx0, ggml_mul(ctx0, delta, layer->channel_mix_lerp_k), cur);
+    ggml_tensor * rec_in = ggml_add(ctx0, ggml_mul(ctx0, delta, layer->channel_mix_lerp_r), cur);
+
+    // receptance gate
+    ggml_tensor * rec_proj = build_lora_mm(layer->channel_mix_receptance, rec_in);
+    ggml_tensor * gate     = ggml_sigmoid(ctx0, rec_proj);
+
+    // squared-ReLU key activation
+    ggml_tensor * key_proj = build_lora_mm(layer->channel_mix_key, key_in);
+    ggml_tensor * key_act  = ggml_sqr(ctx0, ggml_relu(ctx0, key_proj));
+
+    // gated value projection
+    ggml_tensor * value = build_lora_mm(layer->channel_mix_value, key_act);
+    return ggml_mul(ctx0, gate, value);
+}
+
+ggml_tensor * llm_build_rwkv6_base::build_rwkv6_time_mix(llm_graph_input_rs * inp, // RWKV6 time-mix block: lerp-mixes cur with x_prev, runs the WKV6 / gated-linear-attention op against the recurrent state, writes the new state back and returns the projected output
+ ggml_tensor * cur, // current (normed) activations
+ ggml_tensor * x_prev, // token-shifted activations
+ const llama_ubatch & ubatch, // supplies n_tokens / n_seqs / n_seq_tokens
+ int il) const { // layer index
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx); // recurrent-state view of the memory context
+
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_embd = hparams.n_embd;
+ const auto head_size = hparams.wkv_head_size;
+ const auto n_head = n_embd / head_size; // head count is derived from the WKV head size
+ const auto n_head_kv = hparams.n_head_kv(il);
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ bool is_qrwkv = layer.time_mix_first == nullptr; // layers without a time_mix_first tensor take the "qrwkv" paths below
+
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); // sx = x_prev - cur, reused by every lerp
+
+ sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
+
+ xxx = ggml_reshape_4d(ctx0, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)),
+ layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens); // split the w1 projection into 5 equal chunks
+
+ xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); // swap the last two dims so the chunk index becomes the outermost one
+
+ xxx = ggml_mul_mat(
+ ctx0, ggml_reshape_4d(ctx0, layer.time_mix_w2, layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5), xxx);
+
+ ggml_tensor *xw, *xk, *xv, *xr, *xg;
+ if (layer.time_mix_lerp_fused) {
+ // fusing these weights makes some performance improvement
+ sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+ xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur); // one fused lerp for all five chunks
+ xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); // chunk 0 -> w; each chunk is n_embd * n_tokens floats
+ xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); // chunk 1 -> k
+ xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); // chunk 2 -> v
+ xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); // chunk 3 -> r
+ xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); // chunk 4 -> g
+ } else {
+ // for backward compatibility
+ xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); // same chunk layout as the fused path, but the lerp is applied per chunk below
+ xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+
+ xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
+ xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
+ xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
+ xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
+ xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
+ }
+ ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
+ ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
+ ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
+ if (layer.time_mix_receptance_b) { // the r/k/v biases are optional and only added when present
+ r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
+ }
+ if (layer.time_mix_key_b) {
+ k = ggml_add(ctx0, k, layer.time_mix_key_b);
+ }
+ if (layer.time_mix_value_b) {
+ v = ggml_add(ctx0, v, layer.time_mix_value_b);
+ }
+ ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
+ if (is_qrwkv) {
+ g = ggml_sigmoid(ctx0, g); // qrwkv gate uses sigmoid
+ } else {
+ g = ggml_silu(ctx0, g); // otherwise the gate uses SiLU
+ }
+ if (n_head_kv != 0 && n_head_kv != n_head) { // fewer kv heads than heads: repeat k and v up to n_head
+ GGML_ASSERT(n_head % n_head_kv == 0);
+ k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
+ v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
+ ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); // only used as the target shape for ggml_repeat
+ k = ggml_repeat(ctx0, k, tmp);
+ v = ggml_repeat(ctx0, v, tmp);
+ }
+ k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
+ v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
+ r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
+
+ ggml_tensor * w =
+ ggml_mul_mat(ctx0, layer.time_mix_decay_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)));
+
+ w = ggml_add(ctx0, w, layer.time_mix_decay);
+ w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); // w = exp(-exp(w))
+ w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
+
+ if (is_qrwkv) {
+ // k = k * (1 - w)
+ k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
+ }
+ ggml_tensor * wkv_state = build_rs(inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); // per-layer recurrent state obtained through build_rs
+
+ ggml_tensor * wkv_output;
+ if (is_qrwkv) {
+ wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); // scale argument is head_size^-0.5
+ } else {
+ wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
+ }
+ cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); // first n_embd * n_tokens elements: the per-token output
+ wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // the elements right after it: the updated state
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, wkv_state,
+ ggml_view_1d(ctx0, mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs,
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))))); // copy the updated state back into the layer's state tensor, starting at slot kv_head
+
+ if (!is_qrwkv) {
+ // group norm with head_count groups
+ cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
+ cur = ggml_norm(ctx0, cur, 64e-5f);
+
+ // Convert back to regular vectors.
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
+ } else {
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ }
+ cur = ggml_mul(ctx0, cur, g); // apply the gate before the output projection
+ cur = build_lora_mm(layer.time_mix_output, cur);
+
+ return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); // returned as (n_embd, n_seq_tokens, n_seqs)
+}
--- /dev/null
+#include "models.h"
+
+// Builds the RWKV6 compute graph: token embedding + LayerNorm, then for every layer a
+// time-mix block and a channel-mix block (each with a residual add), and finally the
+// output norm and the LM head.
+//
+// Requires hparams.token_shift_count == 2: the per-layer token-shift state holds one
+// n_embd-wide slot for the time-mix input and one for the channel-mix input.
+//
+// Results are published through res->t_embd (after the output norm) and res->t_logits.
+llm_build_rwkv6::llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) :
+    llm_build_rwkv6_base(model, params) {
+    GGML_ASSERT(hparams.token_shift_count == 2);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+    inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+
+    auto * rs_inp = build_rs_inp();
+
+    const auto n_embd       = hparams.n_embd;
+    const auto n_seq_tokens = ubatch.n_seq_tokens;
+    const auto n_seqs       = ubatch.n_seqs;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const llama_layer * layer = &model.layers[il];
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+        ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
+        // slot 0 of the token-shift state feeds the time-mix, slot 1 the channel-mix
+        ggml_tensor * att_shift =
+            ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+        ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1],
+                                               token_shift->nb[2], n_embd * ggml_element_size(token_shift));
+
+        ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
+        cb(att_norm, "attn_norm", il);
+
+        // x_prev = [stored shift, first n_seq_tokens - 1 tokens of att_norm], i.e. att_norm shifted by one token
+        ggml_tensor * x_prev = ggml_concat(
+            ctx0, att_shift,
+            ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), 1);
+
+        cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+        cb(ffn_inp, "ffn_inp", il);
+
+        ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
+        cb(ffn_norm, "ffn_norm", il);
+
+        x_prev = ggml_concat(
+            ctx0, ffn_shift,
+            ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), 1);
+
+        // new token-shift state: the last token of att_norm followed by the last token of ffn_norm
+        token_shift = ggml_concat(ctx0,
+                                  ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2],
+                                               (n_seq_tokens - 1) * n_embd * ggml_element_size(att_norm)),
+                                  ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2],
+                                               (n_seq_tokens - 1) * n_embd * ggml_element_size(ffn_norm)),
+                                  1);
+        ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+        // The time-mix output in `cur` is already folded into ffn_inp and is overwritten by the
+        // channel-mix below, so only the tensors the channel-mix consumes are flattened/gathered.
+        ffn_inp  = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+        ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+        x_prev   = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            // last layer: keep only the rows selected by inp_out_ids
+            ffn_inp  = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+            x_prev   = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+        }
+        cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        // halve the activations every rescale_every_n_layers layers (disabled when that value is 0)
+        if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+            cur = ggml_scale(ctx0, cur, 0.5F);
+        }
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+    cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the RWKV6-Qwen2 compute graph: token embedding, then for every layer an RWKV6
+// time-mix block (RMS-normed input, one token-shift slot) followed by a gated SiLU
+// feed-forward network, each with a residual add, and finally the output norm and LM head.
+//
+// Requires n_embd == hparams.n_embd_r().
+//
+// Results are published through res->t_embd (after the output norm) and res->t_logits.
+llm_build_rwkv6qwen2::llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
+    GGML_ASSERT(n_embd == hparams.n_embd_r());
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    auto * rs_inp = build_rs_inp();
+
+    const auto n_embd       = hparams.n_embd;
+    const auto n_seq_tokens = ubatch.n_seq_tokens;
+    const auto n_seqs       = ubatch.n_seqs;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const llama_layer * layer = &model.layers[il];
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+        ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
+        ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+        cb(att_norm, "attn_norm", il);
+
+        // x_prev = [stored shift, first n_seq_tokens - 1 tokens of att_norm], i.e. att_norm shifted by one token
+        ggml_tensor * x_prev = ggml_concat(
+            ctx0,
+            token_shift,
+            ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+            1
+        );
+
+        cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
+
+        // new token-shift state: the last token of att_norm
+        token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
+        ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // The time-mix output in `cur` is already folded into ffn_inp and is overwritten by the
+        // norm below, so only ffn_inp needs to be flattened/gathered.
+        ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            // last layer: keep only the rows selected by inp_out_ids
+            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+        }
+
+        // feed-forward network
+        cur = build_norm(ffn_inp,
+                layer->ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                layer->ffn_up,   NULL, NULL,
+                layer->ffn_gate, NULL, NULL,
+                layer->ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+    cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Common base of the RWKV7-family graph builders: hands `params` to llm_graph_context
+// and initializes the `model` member from the argument. No graph nodes are created here.
+llm_build_rwkv7_base::llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params), model(model) {
+}
+
+// RWKV7 channel-mix sub-block.
+//   layer  - per-layer weights; channel_mix_lerp_k / channel_mix_key / channel_mix_value are read
+//   cur    - current activations
+//   x_prev - token-shifted activations
+//   arch   - must be LLM_ARCH_RWKV7; any other value aborts
+// Returns value(relu(key(cur + (x_prev - cur) * lerp_k))^2); unlike RWKV6 there is no receptance gate.
+ggml_tensor * llm_build_rwkv7_base::build_rwkv7_channel_mix(const llama_layer * layer,
+                                                            ggml_tensor *       cur,
+                                                            ggml_tensor *       x_prev,
+                                                            llm_arch            arch) const {
+    ggml_tensor * delta = ggml_sub(ctx0, x_prev, cur);
+
+    if (arch != LLM_ARCH_RWKV7) {
+        GGML_ABORT("fatal error");
+    }
+
+    // lerp input for the key projection
+    ggml_tensor * key_in = ggml_mul(ctx0, delta, layer->channel_mix_lerp_k);
+    key_in               = ggml_add(ctx0, key_in, cur);
+
+    // squared-ReLU key activation
+    ggml_tensor * act = build_lora_mm(layer->channel_mix_key, key_in);
+    act               = ggml_relu(ctx0, act);
+    act               = ggml_sqr(ctx0, act);
+
+    return build_lora_mm(layer->channel_mix_value, act);
+}
+
+ggml_tensor * llm_build_rwkv7_base::build_rwkv7_time_mix(llm_graph_input_rs * inp, // RWKV7 time-mix block: lerp-mixes cur with x_prev, runs the WKV7 op against the recurrent state, writes the new state back and returns the projected output
+ ggml_tensor * cur, // current (normed) activations
+ ggml_tensor * x_prev, // token-shifted activations
+ ggml_tensor *& first_layer_value, // in/out: set to this call's v when null, otherwise blended into v
+ const llama_ubatch & ubatch, // supplies n_tokens / n_seqs / n_seq_tokens
+ int il) const { // layer index
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx); // recurrent-state view of the memory context
+
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+ const auto n_embd = hparams.n_embd;
+ const auto head_size = hparams.wkv_head_size;
+ const auto head_count = n_embd / head_size; // head count is derived from the WKV head size
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ bool has_gating = layer.time_mix_g1 && layer.time_mix_g2; // gating is used only when both gate tensors are present
+
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); // sx = x_prev - cur
+ ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5); // only used as the target shape for ggml_repeat: one slab per lerp (6 with gating, 5 without)
+ sx = ggml_repeat(ctx0, sx, dummy);
+
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur); // all lerps in one fused op
+
+ ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); // slab 0 -> r; each slab is n_embd * n_tokens floats
+ ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); // slab 1 -> w
+ ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); // slab 2 -> k
+ ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); // slab 3 -> v
+ ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); // slab 4 -> a
+ ggml_tensor * xg =
+ has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : // slab 5 -> g, exists only with gating
+ nullptr;
+
+ ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
+ ggml_tensor * w = ggml_add(
+ ctx0, ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
+ layer.time_mix_w0);
+ w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531)); // w = exp(-0.606531 * sigmoid(w))
+
+ ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
+ ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
+ if (first_layer_value == nullptr) {
+ first_layer_value = v; // first call: remember v for later calls through the reference parameter
+ } else {
+ // Add the first layer value as a residual connection.
+ v = ggml_add(ctx0, v,
+ ggml_mul(ctx0, ggml_sub(ctx0, first_layer_value, v),
+ ggml_sigmoid(ctx0, ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.time_mix_v2,
+ ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
+ layer.time_mix_v0)))); // v += (first_layer_value - v) * sigmoid(v2 * (v1 * xv) + v0)
+ }
+ ggml_tensor * g = nullptr;
+ if (layer.time_mix_g1 && layer.time_mix_g2) { // same condition as has_gating
+ g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
+ }
+ ggml_tensor * a = ggml_sigmoid(
+ ctx0, ggml_add(ctx0, ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
+ layer.time_mix_a0));
+
+ ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
+ kk = ggml_l2_norm(ctx0, kk, 1e-12); // L2-normalize the scaled key (eps 1e-12)
+
+ ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
+ k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka)); // k = k + a * ka - ka
+
+ r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
+ w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
+ k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
+ v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
+ a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
+
+ ggml_tensor * wkv_state = build_rs(inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); // per-layer recurrent state obtained through build_rs
+
+ ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state); // last two tensor inputs are -kk and kk * a
+ cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); // first n_embd * n_tokens elements: the per-token output
+ wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // the elements right after it: the updated state
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, wkv_state,
+ ggml_view_1d(ctx0, mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs,
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))))); // copy the updated state back into the layer's state tensor, starting at slot kv_head
+
+ if (layer.time_mix_ln && layer.time_mix_ln_b) { // the group norm is applied only when both its weight and bias are present
+ // group norm with head_count groups
+ cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
+ cur = ggml_norm(ctx0, cur, 64e-5f);
+
+ // Convert back to regular vectors.
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
+ } else {
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ }
+ ggml_tensor * rk = ggml_sum_rows(
+ ctx0, ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count))); // rk = row-sum of k * r * r_k
+ cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens)); // add the v * rk term to the WKV output
+
+ if (has_gating) {
+ cur = ggml_mul(ctx0, cur, g);
+ }
+ cur = build_lora_mm(layer.time_mix_output, cur);
+
+ return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); // returned as (n_embd, n_seq_tokens, n_seqs)
+}
--- /dev/null
+#include "models.h"
+
+// Builds the RWKV7 compute graph: token embedding + LayerNorm, then for every layer a
+// time-mix block and a channel-mix block (each with a residual add), and finally the
+// output norm and the LM head.
+//
+// Requires hparams.token_shift_count == 2: the per-layer token-shift state holds one
+// n_embd-wide slot for the time-mix input and one for the channel-mix input.
+//
+// Results are published through res->t_embd (after the output norm) and res->t_logits.
+llm_build_rwkv7::llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) :
+    llm_build_rwkv7_base(model, params) {
+    GGML_ASSERT(hparams.token_shift_count == 2);
+
+    ggml_tensor * cur;
+    ggml_tensor * hidden;
+    // set by the first time-mix call, reused by the later ones
+    ggml_tensor * v_first = nullptr;
+
+    hidden = build_inp_embd(model.tok_embd);
+    hidden = build_norm(hidden, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+
+    auto * rs_inp = build_rs_inp();
+
+    const auto n_embd       = hparams.n_embd;
+    const auto n_seq_tokens = ubatch.n_seq_tokens;
+    const auto n_seqs       = ubatch.n_seqs;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const llama_layer * layer = &model.layers[il];
+
+        hidden = ggml_reshape_3d(ctx0, hidden, n_embd, n_seq_tokens, n_seqs);
+
+        // stored token-shift state: slot 0 feeds the time-mix, slot 1 the channel-mix
+        ggml_tensor * shift_state = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+        ggml_tensor * shift_att   = ggml_view_3d(ctx0, shift_state, n_embd, 1, n_seqs,
+                                                 shift_state->nb[1], shift_state->nb[2], 0);
+        ggml_tensor * shift_ffn   = ggml_view_3d(ctx0, shift_state, n_embd, 1, n_seqs,
+                                                 shift_state->nb[1], shift_state->nb[2],
+                                                 n_embd * ggml_element_size(shift_state));
+
+        // ---- time-mix ----
+        ggml_tensor * x_att = build_norm(hidden, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
+        cb(x_att, "attn_norm", il);
+
+        // previous-token input: stored shift followed by all but the last token of x_att
+        ggml_tensor * x_att_head = ggml_view_3d(ctx0, x_att, n_embd, n_seq_tokens - 1, n_seqs,
+                                                x_att->nb[1], x_att->nb[2], 0);
+        ggml_tensor * x_prev     = ggml_concat(ctx0, shift_att, x_att_head, 1);
+
+        cur = build_rwkv7_time_mix(rs_inp, x_att, x_prev, v_first, ubatch, il);
+
+        ggml_tensor * residual = ggml_add(ctx0, cur, hidden);
+        cb(residual, "ffn_inp", il);
+
+        // ---- channel-mix inputs ----
+        ggml_tensor * x_ffn = build_norm(residual, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
+        cb(x_ffn, "ffn_norm", il);
+
+        ggml_tensor * x_ffn_head = ggml_view_3d(ctx0, x_ffn, n_embd, n_seq_tokens - 1, n_seqs,
+                                                x_ffn->nb[1], x_ffn->nb[2], 0);
+        x_prev = ggml_concat(ctx0, shift_ffn, x_ffn_head, 1);
+
+        // new token-shift state: last token of x_att followed by last token of x_ffn
+        ggml_tensor * att_last = ggml_view_3d(ctx0, x_att, n_embd, 1, n_seqs, x_att->nb[1], x_att->nb[2],
+                                              (n_seq_tokens - 1) * n_embd * ggml_element_size(x_att));
+        ggml_tensor * ffn_last = ggml_view_3d(ctx0, x_ffn, n_embd, 1, n_seqs, x_ffn->nb[1], x_ffn->nb[2],
+                                              (n_seq_tokens - 1) * n_embd * ggml_element_size(x_ffn));
+        shift_state = ggml_concat(ctx0, att_last, ffn_last, 1);
+        ggml_build_forward_expand(gf, build_rwkv_token_shift_store(shift_state, ubatch, il));
+
+        // flatten to (n_embd, n_tokens) for the channel-mix
+        residual = ggml_reshape_2d(ctx0, residual, n_embd, n_tokens);
+        x_ffn    = ggml_reshape_2d(ctx0, x_ffn, n_embd, n_tokens);
+        x_prev   = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+
+        const bool is_last_layer = (il == n_layer - 1);
+        if (is_last_layer && inp_out_ids) {
+            // keep only the rows selected by inp_out_ids
+            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            x_ffn    = ggml_get_rows(ctx0, x_ffn, inp_out_ids);
+            x_prev   = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+        }
+
+        // ---- channel-mix + residual ----
+        cur = build_rwkv7_channel_mix(layer, x_ffn, x_prev, LLM_ARCH_RWKV7);
+        cur = ggml_add(ctx0, cur, residual);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // feeds the next layer
+        hidden = cur;
+    }
+
+    cur = build_norm(hidden, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the Seed-OSS compute graph: token embedding, then for every layer RMS norm ->
+// self-attention (optional q/k/v biases, RoPE on q and k) -> residual add -> RMS norm with
+// attn_post_norm -> gated SiLU feed-forward -> residual add, and finally the output norm
+// and LM head.
+//
+// Requires n_embd_head_v == n_embd_head_k == n_rot.
+// The attention scale is hparams.f_attention_scale, or 1/sqrt(n_embd_head) when that is 0.
+//
+// Results are published through res->t_embd (after the output norm) and res->t_logits.
+llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_in = build_attn_inp_kv();
+
+    float scale = 1.0f/sqrtf(float(n_embd_head));
+    if (hparams.f_attention_scale != 0.0f) {
+        scale = hparams.f_attention_scale;
+    }
+    const float kq_scale = scale;
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto &  layer    = model.layers[il];
+        ggml_tensor * residual = hidden;
+
+        // pre-attention norm
+        cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // q/k/v projections, each with an optional bias
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            // rotary embedding on q and k
+            q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+            k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                              n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                              ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(attn_in, layer.wo, layer.bo,
+                             q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        const bool is_last_layer = (il == n_layer - 1);
+        if (is_last_layer && out_ids) {
+            // keep only the rows selected by out_ids
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_in = ggml_add(ctx0, cur, residual);
+        cb(ffn_in, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_in, layer.attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        cur = build_ffn(cur,
+                        layer.ffn_up,   NULL, NULL,
+                        layer.ffn_gate, NULL, NULL,
+                        layer.ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_in);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // feeds the next layer
+        hidden = cur;
+    }
+
+    cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Builds the SmallThinker compute graph. `iswa` selects the attention input type at compile
+// time: build_attn_inp_kv_iswa() when true, build_attn_inp_kv() when false.
+//
+// Per layer: the MoE router logits are computed from the layer input (before the attention
+// norm), then RMS norm -> self-attention (RoPE is applied when n_no_rope_layer_step equals
+// n_layer or il is not a multiple of it) -> residual add -> RMS norm -> MoE feed-forward
+// driven by the precomputed router logits -> residual add.
+//
+// Requires n_embd_head_v == n_embd_head_k == n_rot.
+// Results are published through res->t_embd (after the output norm) and res->t_logits.
+template <bool iswa>
+llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * attn_in = nullptr;
+
+    if constexpr (iswa) {
+        attn_in = build_attn_inp_kv_iswa();
+    } else {
+        attn_in = build_attn_inp_kv();
+    }
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto &  layer    = model.layers[il];
+        ggml_tensor * residual = hidden;
+
+        // router logits come from the un-normed layer input
+        ggml_tensor * router = build_lora_mm(layer.ffn_gate_inp, hidden);  // [n_expert, n_tokens]
+        cb(router, "ffn_moe_logits", il);
+
+        // pre-attention norm
+        cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            const bool apply_rope =
+                hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0;
+            if (apply_rope) {
+                q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                                  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+                k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                                  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+
+            cur = build_attn(attn_in, layer.wo, layer.bo,
+                             q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+        }
+
+        const bool is_last_layer = (il == n_layer - 1);
+        if (is_last_layer && out_ids) {
+            // keep only the rows selected by out_ids
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+            router   = ggml_get_rows(ctx0, router, out_ids);
+        }
+
+        ggml_tensor * ffn_in = ggml_add(ctx0, cur, residual);
+        cb(ffn_in, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_in, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        const auto gating_func = static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func);
+
+        cur = build_moe_ffn(cur,
+                            nullptr,
+                            layer.ffn_up_exps,
+                            layer.ffn_gate_exps,
+                            layer.ffn_down_exps,
+                            nullptr,
+                            n_expert, n_expert_used,
+                            LLM_FFN_RELU, true,
+                            false, 0.0,
+                            gating_func,
+                            il, router);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_in);
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // feeds the next layer
+        hidden = cur;
+    }
+
+    cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_smallthinker<false>;
+template struct llm_build_smallthinker<true>;
--- /dev/null
+#include "models.h"
+
+// Builds the SmolLM3 compute graph: token embedding, then for every layer RMS norm ->
+// self-attention (optional q/k/v biases) -> residual add -> RMS norm -> gated SiLU
+// feed-forward (with the layer's ffn_*_b bias tensors passed through) -> residual add,
+// and finally the output norm and LM head.
+//
+// RoPE is skipped on every layer where (il + 1) is a multiple of hparams.n_no_rope_layer_step.
+// Requires n_embd_head_v == n_embd_head_k == n_rot.
+// The attention scale is hparams.f_attention_scale, or 1/sqrt(n_embd_head) when that is 0.
+//
+// Results are published through res->t_embd (after the output norm) and res->t_logits.
+llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by RoPE
+    ggml_tensor * positions = build_inp_pos();
+
+    auto * attn_in = build_attn_inp_kv();
+
+    float scale = 1.0f/sqrtf(float(n_embd_head));
+    if (hparams.f_attention_scale != 0.0f) {
+        scale = hparams.f_attention_scale;
+    }
+    const float kq_scale = scale;
+
+    ggml_tensor * out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto &  layer    = model.layers[il];
+        ggml_tensor * residual = hidden;
+
+        const bool apply_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+        // pre-attention norm
+        cur = build_norm(hidden, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // q/k/v projections, each with an optional bias
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            // split into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            if (apply_rope) {
+                q = ggml_rope_ext(ctx0, q, positions, nullptr,
+                                  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+                k = ggml_rope_ext(ctx0, k, positions, nullptr,
+                                  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(attn_in, layer.wo, layer.bo,
+                             q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        const bool is_last_layer = (il == n_layer - 1);
+        if (is_last_layer && out_ids) {
+            // keep only the rows selected by out_ids
+            cur      = ggml_get_rows(ctx0, cur, out_ids);
+            residual = ggml_get_rows(ctx0, residual, out_ids);
+        }
+
+        ggml_tensor * ffn_in = ggml_add(ctx0, cur, residual);
+        cb(ffn_in, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_in, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                        layer.ffn_up,   layer.ffn_up_b,   NULL,
+                        layer.ffn_gate, layer.ffn_gate_b, NULL,
+                        layer.ffn_down, layer.ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_in);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // feeds the next layer
+        hidden = cur;
+    }
+
+    cur = build_norm(hidden, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for StableLM.
+//
+// Every layer applies LLM_NORM to its input, projects Q/K/V (adding a bias only when the
+// layer carries one), optionally normalizes Q and K per head, applies RoPE to Q and K and
+// runs attention. The FFN (LLM_FFN_SILU, LLM_FFN_PAR) reads a normed copy of the residual
+// stream when the layer has an ffn_norm; otherwise it reads the attention-normed input
+// (parallel residual). The output-normed hidden state is stored in res->t_embd and the
+// lm_head projection in res->t_logits.
+llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // scale handed to build_attn; identical for every layer
+    const float attn_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by the RoPE calls below
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // pre-attention norm
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // remembered for layers without ffn_norm, whose FFN consumes this tensor directly
+        ggml_tensor * attn_normed = cur;
+
+        // self-attention
+        {
+            // projections, each followed by its bias when present
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            // optional Q/K norms (no bias), applied before RoPE
+            if (layer.attn_q_norm) {
+                q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM, il);
+                cb(q, "Qcur", il);
+            }
+            if (layer.attn_k_norm) {
+                k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM, il);
+                cb(k, "Kcur", il);
+            }
+
+            q = ggml_rope_ext(ctx0, q, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            // output projection without bias
+            cur = build_attn(inp_attn,
+                    layer.wo, nullptr,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (inp_out_ids && il == n_layer - 1) {
+            cur         = ggml_get_rows(ctx0, cur,         inp_out_ids);
+            inpL        = ggml_get_rows(ctx0, inpL,        inp_out_ids);
+            attn_normed = ggml_get_rows(ctx0, attn_normed, inp_out_ids);
+        }
+
+        // residual: attention output + layer input
+        ggml_tensor * resid = ggml_add(ctx0, cur, inpL);
+        cb(resid, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            if (!layer.ffn_norm) {
+                // parallel residual
+                cur = attn_normed;
+            } else {
+                cur = build_norm(resid, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   nullptr, nullptr,
+                    layer.ffn_gate, nullptr, nullptr,
+                    layer.ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, resid);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        inpL = cur;
+    }
+
+    // final norm
+    ggml_tensor * cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for StarCoder.
+//
+// Rows of model.pos_embd selected by inp_pos are added to the token embeddings. Each layer
+// then runs LLM_NORM, a fused QKV projection with bias that is sliced into Q/K/V views,
+// attention with an output bias, and an FFN (LLM_FFN_GELU, LLM_FFN_SEQ) with biases.
+// The output-normed hidden state goes to res->t_embd, the lm_head projection to res->t_logits.
+llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // scale handed to build_attn; identical for every layer
+    const float attn_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    // byte stride between heads and byte offsets of K and V inside one fused QKV row
+    // (Q starts at offset 0)
+    const size_t head_stride = n_embd_head*sizeof(float);
+    const size_t q_offset    = 0*sizeof(float)*(n_embd);
+    const size_t k_offset    = 1*sizeof(float)*(n_embd);
+    const size_t v_offset    = 1*sizeof(float)*(n_embd + n_embd_gqa);
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, used to pick rows of the position embedding table
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * pos_rows = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+    cb(pos_rows, "pos_embd", -1);
+
+    inpL = ggml_add(ctx0, inpL, pos_rows);
+    cb(inpL, "inpL", -1);
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            cur = build_lora_mm(layer.wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            cur = ggml_add(ctx0, cur, layer.bqkv);
+            cb(cur, "bqkv", il);
+
+            // per-head views into the fused activation
+            ggml_tensor * q = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, head_stride, cur->nb[1], q_offset);
+            ggml_tensor * k = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_stride, cur->nb[1], k_offset);
+            ggml_tensor * v = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, head_stride, cur->nb[1], v_offset);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (inp_out_ids && il == n_layer - 1) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+
+        // residual: attention output + layer input
+        ggml_tensor * resid = ggml_add(ctx0, cur, inpL);
+        cb(resid, "ffn_inp", il);
+
+        // feed-forward
+        {
+            cur = build_norm(resid, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+            cb(cur, "ffn_norm", il);
+
+            // no gate tensor is passed
+            cur = build_ffn(cur,
+                    layer.ffn_up,   layer.ffn_up_b,   nullptr,
+                    nullptr,        nullptr,          nullptr,
+                    layer.ffn_down, layer.ffn_down_b, nullptr,
+                    nullptr,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, resid);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        inpL = cur;
+    }
+
+    // final norm
+    ggml_tensor * cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for StarCoder2.
+//
+// Asserts that the head size equals both hparams.n_embd_head_k and hparams.n_rot.
+// Each layer: LLM_NORM, separate Q/K/V projections (bias added only when present),
+// RoPE on Q and K, attention with an output bias, then an FFN (LLM_FFN_GELU, LLM_FFN_SEQ)
+// with biases. The output-normed hidden state is stored in res->t_embd and the lm_head
+// projection in res->t_logits.
+llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    // scale handed to build_attn; identical for every layer
+    const float attn_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by the RoPE calls below
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // un-normed layer input, added back after attention
+        ggml_tensor * skip = inpL;
+
+        // pre-attention norm
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, layer.attn_norm_b, LLM_NORM, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // projections, each followed by its bias when present
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+            if (layer.bq) {
+                q = ggml_add(ctx0, q, layer.bq);
+                cb(q, "Qcur", il);
+            }
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+            if (layer.bk) {
+                k = ggml_add(ctx0, k, layer.bk);
+                cb(k, "Kcur", il);
+            }
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+            if (layer.bv) {
+                v = ggml_add(ctx0, v, layer.bv);
+                cb(v, "Vcur", il);
+            }
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            q = ggml_rope_ext(ctx0, q, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.bo,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (inp_out_ids && il == n_layer - 1) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            skip = ggml_get_rows(ctx0, skip, inp_out_ids);
+        }
+
+        // residual: attention output + layer input
+        ggml_tensor * resid = ggml_add(ctx0, cur, skip);
+        cb(resid, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(resid, layer.ffn_norm, layer.ffn_norm_b, LLM_NORM, il);
+        cb(cur, "ffn_norm", il);
+
+        // no gate tensor is passed
+        cur = build_ffn(cur,
+                layer.ffn_up,   layer.ffn_up_b,   nullptr,
+                nullptr,        nullptr,          nullptr,
+                layer.ffn_down, layer.ffn_down_b, nullptr,
+                nullptr,
+                LLM_FFN_GELU, LLM_FFN_SEQ, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, resid);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        inpL = cur;
+    }
+
+    // final norm
+    ggml_tensor * cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the T5 decoder.
+//
+// Iterates over hparams.dec_n_layer layers. Each layer runs:
+//   1. RMS norm + self-attention over the KV-cache input, with a position bias built from
+//      pos_bucket_dec and the layer's attn_rel_b (layer 0's tensor is used when the layer
+//      has none), followed by a residual add;
+//   2. RMS norm + cross-attention whose K and V are projected from the encoder embeddings
+//      returned by build_inp_cross_embd(), followed by a residual add;
+//   3. RMS norm + FFN, gated (LLM_FFN_GELU, LLM_FFN_PAR) when the layer has ffn_gate and
+//      plain (LLM_FFN_RELU, LLM_FFN_SEQ) otherwise, followed by a residual add.
+// Both attention calls pass a scale of 1.0f. The output-normed hidden state is stored in
+// res->t_embd and the lm_head projection in res->t_logits.
+llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    ggml_tensor * embd_enc       = build_inp_cross_embd();
+    ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
+
+    // size of dimension 1 of the encoder embeddings; used as the K/V length in cross-attention
+    const int64_t n_outputs_enc = embd_enc->ne[1];
+
+    auto * inp_attn_self  = build_attn_inp_kv();
+    auto * inp_attn_cross = build_attn_inp_cross();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    const int64_t dec_n_layer = hparams.dec_n_layer;
+
+    for (int il = 0; il < dec_n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // un-normed layer input, added back after self-attention
+        ggml_tensor * self_skip = inpL;
+
+        // norm
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            // layers without their own relative-bias tensor fall back to layer 0's
+            ggml_tensor * rel_b = layer.attn_rel_b;
+            if (!rel_b) {
+                rel_b = model.layers[0].attn_rel_b;
+            }
+            ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, rel_b);
+
+            cur = build_attn(inp_attn_self,
+                    layer.wo, layer.bo,
+                    q, k, v, kq_b, nullptr, nullptr, 1.0f, il);
+            cb(cur, "kqv_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, self_skip);
+        cb(cur, "cross_inp", il);
+
+        // residual stream before cross-attention, added back afterwards
+        ggml_tensor * cross_skip = cur;
+
+        // norm
+        cur = build_norm(cur, layer.attn_norm_cross, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm_cross", il);
+
+        // cross-attention: Q from the decoder stream, K/V from the encoder embeddings
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq_cross, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk_cross, embd_enc);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv_cross, embd_enc);
+            cb(v, "Vcur", il);
+
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_outputs_enc);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_outputs_enc);
+
+            // no position bias and no output bias here
+            cur = build_attn(inp_attn_cross,
+                    layer.wo_cross, nullptr,
+                    q, k, v, nullptr, nullptr, nullptr, 1.0f, il);
+            cb(cur, "kqv_out", il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (inp_out_ids && il == dec_n_layer - 1) {
+            cur        = ggml_get_rows(ctx0, cur,        inp_out_ids);
+            cross_skip = ggml_get_rows(ctx0, cross_skip, inp_out_ids);
+        }
+
+        ggml_tensor * resid = ggml_add(ctx0, cur, cross_skip);
+        cb(resid, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            cur = build_norm(resid, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // T5 uses relu, flan-T5 uses gelu-gated
+            const bool gated  = layer.ffn_gate != nullptr;
+            const auto ffn_op = gated ? LLM_FFN_GELU : LLM_FFN_RELU;
+            const auto ffn_ty = gated ? LLM_FFN_PAR  : LLM_FFN_SEQ;
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   nullptr, nullptr,
+                    layer.ffn_gate, nullptr, nullptr,
+                    layer.ffn_down, nullptr, nullptr,
+                    nullptr,
+                    ffn_op, ffn_ty, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, resid);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        inpL = cur;
+    }
+
+    ggml_tensor * cur = inpL;
+    cb(cur, "result_embd", -1);
+
+    // final norm
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the T5 encoder.
+//
+// Uses the attention input from build_attn_inp_no_cache(). Each layer runs RMS norm +
+// self-attention with a position bias built from pos_bucket_enc and the layer's
+// attn_rel_b_enc (layer 0's tensor is used when the layer has none), a residual add,
+// then RMS norm + FFN, gated (LLM_FFN_GELU, LLM_FFN_PAR) when the layer has ffn_gate_enc
+// and plain (LLM_FFN_RELU, LLM_FFN_SEQ) otherwise, and another residual add.
+// Only res->t_embd is set (after the encoder output norm); no logits are produced here.
+llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
+
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // un-normed layer input, added back after attention
+        ggml_tensor * skip = inpL;
+
+        // norm
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm_enc, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq_enc, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk_enc, cur);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv_enc, cur);
+            cb(v, "Vcur", il);
+
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            // layers without their own relative-bias tensor fall back to layer 0's
+            ggml_tensor * rel_b = layer.attn_rel_b_enc;
+            if (!rel_b) {
+                rel_b = model.layers[0].attn_rel_b_enc;
+            }
+            ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, rel_b);
+
+            cur = build_attn(inp_attn,
+                    layer.wo_enc, nullptr,
+                    q, k, v, kq_b, nullptr, nullptr, 1.0f, il);
+            cb(cur, "kqv_out", il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (inp_out_ids && il == n_layer - 1) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            skip = ggml_get_rows(ctx0, skip, inp_out_ids);
+        }
+
+        ggml_tensor * resid = ggml_add(ctx0, cur, skip);
+        cb(resid, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            cur = build_norm(resid, layer.ffn_norm_enc, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // T5 uses relu, flan-T5 uses gelu-gated
+            const bool gated  = layer.ffn_gate_enc != nullptr;
+            const auto ffn_op = gated ? LLM_FFN_GELU : LLM_FFN_RELU;
+            const auto ffn_ty = gated ? LLM_FFN_PAR  : LLM_FFN_SEQ;
+
+            cur = build_ffn(cur,
+                    layer.ffn_up_enc,   nullptr, nullptr,
+                    layer.ffn_gate_enc, nullptr, nullptr,
+                    layer.ffn_down_enc, nullptr, nullptr,
+                    nullptr,
+                    ffn_op, ffn_ty, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, resid);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        inpL = cur;
+    }
+
+    ggml_tensor * cur = inpL;
+    cb(cur, "result_embd", -1);
+
+    // final encoder norm
+    cur = build_norm(cur, model.output_norm_enc, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for the WavTokenizer decoder.
+//
+// Pipeline: token embeddings -> transpose -> input conv (+bias) -> posnet stages ->
+// transpose, token norm, transpose -> convnext stack -> transpose, output norm ->
+// output projection (+bias). The result is stored in res->t_embd; no logits are set.
+//
+// The posnet stage layout is hard-coded by layer index:
+//   0, 1, 3, 4 : two rounds of group norm, x*sigmoid(x) and conv, plus a residual add
+//   2          : group norm followed by attention built from conv projections, plus a residual add
+//   5          : group norm only
+// Any other index aborts.
+llm_build_wavtokenizer_dec::llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    // contiguous copy of the transposed tensor
+    const auto transpose_cont = [&](ggml_tensor * t) {
+        return ggml_cont(ctx0, ggml_transpose(ctx0, t));
+    };
+
+    // t * sigmoid(t)
+    const auto mul_sigmoid = [&](ggml_tensor * t) {
+        return ggml_mul(ctx0, ggml_sigmoid(ctx0, t), t);
+    };
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    ggml_tensor * cur = transpose_cont(inpL);
+
+    cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+    cur = ggml_add(ctx0, cur, model.conv1d_b);
+
+    // posnet
+    for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+        const auto & pn = model.layers[il].posnet;
+
+        // stage input, added back by the residual stages
+        inpL = cur;
+
+        if (il == 0 || il == 1 || il == 3 || il == 4) {
+            cur = build_norm(cur, pn.norm1, pn.norm1_b, LLM_NORM_GROUP, 0);
+            cur = mul_sigmoid(cur);
+
+            cur = ggml_conv_1d_ph(ctx0, pn.conv1, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, pn.conv1_b);
+
+            cur = build_norm(cur, pn.norm2, pn.norm2_b, LLM_NORM_GROUP, 0);
+            cur = mul_sigmoid(cur);
+
+            cur = ggml_conv_1d_ph(ctx0, pn.conv2, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, pn.conv2_b);
+
+            cur = ggml_add(ctx0, cur, inpL);
+        } else if (il == 2) {
+            cur = build_norm(cur, pn.attn_norm, pn.attn_norm_b, LLM_NORM_GROUP, 0);
+
+            // Q/K/V come from convolutions over the normed input; biases are added afterwards
+            ggml_tensor * attn_q = ggml_conv_1d_ph(ctx0, pn.attn_q, cur, 1, 1);
+            ggml_tensor * attn_k = ggml_conv_1d_ph(ctx0, pn.attn_k, cur, 1, 1);
+            ggml_tensor * attn_v = ggml_conv_1d_ph(ctx0, pn.attn_v, cur, 1, 1);
+
+            attn_q = ggml_add(ctx0, attn_q, pn.attn_q_b);
+            attn_k = ggml_add(ctx0, attn_k, pn.attn_k_b);
+            attn_v = ggml_add(ctx0, attn_v, pn.attn_v_b);
+
+            // only Q and K are transposed
+            attn_q = transpose_cont(attn_q);
+            attn_k = transpose_cont(attn_k);
+
+            ggml_tensor * scores = ggml_mul_mat(ctx0, attn_k, attn_q);
+
+            // unmasked softmax, scaled by 1/sqrt(posnet.n_embd)
+            scores = ggml_soft_max_ext(ctx0, scores, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+
+            cur = ggml_mul_mat(ctx0, scores, attn_v);
+
+            cur = ggml_conv_1d_ph(ctx0, pn.attn_o, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, pn.attn_o_b);
+
+            cur = ggml_add(ctx0, cur, inpL);
+        } else if (il == 5) {
+            cur = build_norm(cur, pn.norm, pn.norm_b, LLM_NORM_GROUP, 0);
+        } else {
+            GGML_ABORT("unknown posnet layer");
+        }
+    }
+
+    // token norm, applied on the transposed tensor
+    cur = transpose_cont(cur);
+    cur = build_norm(cur, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+    cur = transpose_cont(cur);
+
+    inpL = cur;
+
+    // convnext
+    for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+        const auto & cnx = model.layers[il].convnext;
+
+        cur = ggml_conv_1d_dw_ph(ctx0, cnx.dw, inpL, 1, 1);
+        cur = ggml_add(ctx0, cur, cnx.dw_b);
+
+        cur = transpose_cont(cur);
+
+        cur = build_norm(cur, cnx.norm, cnx.norm_b, LLM_NORM, -1);
+
+        // pw1 -> GELU -> pw2, no gate tensor
+        cur = build_ffn(cur,
+                cnx.pw1, cnx.pw1_b, nullptr,
+                nullptr, nullptr,   nullptr,
+                cnx.pw2, cnx.pw2_b, nullptr,
+                nullptr,
+                LLM_FFN_GELU, LLM_FFN_SEQ, il);
+
+        cur = ggml_mul(ctx0, cur, cnx.gamma);
+
+        cur = transpose_cont(cur);
+
+        // residual add; the sum feeds the next convnext layer
+        inpL = ggml_add(ctx0, cur, inpL);
+    }
+
+    cur = transpose_cont(inpL);
+
+    cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cur = ggml_add(ctx0, cur, model.output_b);
+
+    cb(cur, "result_embd", -1);
+    res->t_embd = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- /dev/null
+#include "models.h"
+
+// Graph builder for XVERSE.
+//
+// Asserts that the head size equals both hparams.n_embd_head_k and hparams.n_rot.
+// Each layer: RMS norm, bias-free Q/K/V projections, RoPE on Q and K, attention without
+// an output bias, residual add, RMS norm, FFN (LLM_FFN_SILU, LLM_FFN_PAR) without biases,
+// residual add. The output-normed hidden state is stored in res->t_embd and the lm_head
+// projection in res->t_logits.
+llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    // scale handed to build_attn; identical for every layer
+    const float attn_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    // token positions, consumed by the RoPE calls below
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        // un-normed layer input, added back after attention
+        ggml_tensor * skip = inpL;
+
+        ggml_tensor * cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * q = build_lora_mm(layer.wq, cur);
+            cb(q, "Qcur", il);
+
+            ggml_tensor * k = build_lora_mm(layer.wk, cur);
+            cb(k, "Kcur", il);
+
+            ggml_tensor * v = build_lora_mm(layer.wv, cur);
+            cb(v, "Vcur", il);
+
+            // split the projections into heads
+            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
+            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+            q = ggml_rope_ext(ctx0, q, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            k = ggml_rope_ext(ctx0, k, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(q, "Qcur", il);
+            cb(k, "Kcur", il);
+            cb(v, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, nullptr,
+                    q, k, v, nullptr, nullptr, nullptr, attn_scale, il);
+        }
+
+        // on the last layer keep only the rows selected by inp_out_ids
+        if (inp_out_ids && il == n_layer - 1) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            skip = ggml_get_rows(ctx0, skip, inp_out_ids);
+        }
+
+        // residual: attention output + layer input
+        ggml_tensor * resid = ggml_add(ctx0, cur, skip);
+        cb(resid, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            cur = build_norm(resid, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   nullptr, nullptr,
+                    layer.ffn_gate, nullptr, nullptr,
+                    layer.ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, resid);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // becomes the input of the next layer
+        inpL = cur;
+    }
+
+    // final norm
+    ggml_tensor * cur = build_norm(inpL, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}