unicode.cpp
unicode-data.cpp
${SRC_MODELS})
- target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+ target_include_directories(${TARGET} PRIVATE . ${SDL2_INCLUDE_DIRS})
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS ${TARGET} RUNTIME)
std::vector<ggml_tensor *> tensors; // per layer
};
+using llama_adapter_cvec_ptr = std::shared_ptr<llama_adapter_cvec>;
+
//
// llama_adapter_lora
//
};
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
+using llama_adapter_loras_ptr = std::unique_ptr<llama_adapter_loras>;
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
+ { LLM_ARCH_EUROBERT, "eurobert" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
+ { LLM_ARCH_JAIS2, "jais2" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
+ { LLM_ARCH_PADDLEOCR, "paddleocr" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_STEP35, "step35" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_CLS, "cls" },
{ LLM_TENSOR_CLS_OUT, "cls.output" },
+ { LLM_TENSOR_CLS_NORM, "cls.norm" },
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_GRANITE:
case LLM_ARCH_ERNIE4_5:
+ case LLM_ARCH_PADDLEOCR:
case LLM_ARCH_SMOLLM3:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
+ case LLM_ARCH_EUROBERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ };
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_CLS_NORM,
};
case LLM_ARCH_JINA_BERT_V2:
return {
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
case LLM_ARCH_GLM4_MOE:
return {
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
};
+ case LLM_ARCH_JAIS2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ };
case LLM_ARCH_NEMOTRON_H:
return {
LLM_TENSOR_TOKEN_EMBD,
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
{LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
{LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
LLM_ARCH_NEO_BERT,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_JINA_BERT_V3,
+ LLM_ARCH_EUROBERT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
+ LLM_ARCH_JAIS2,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_NEMOTRON_H_MOE,
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3,
+ LLM_ARCH_PADDLEOCR,
LLM_ARCH_MIMO2,
LLM_ARCH_STEP35,
LLM_ARCH_LLAMA_EMBED,
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_CLS_NORM,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
const llama_model & model,
llama_context_params params) :
model(model),
+ cvec(std::make_unique<llama_adapter_cvec>()),
+ loras(std::make_unique<llama_adapter_loras>()),
balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
// TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
// may need to be backend-dependent
}
float * llama_context::get_logits_ith(int32_t i) {
- int64_t j = -1;
-
output_reorder();
try {
throw std::runtime_error("no logits");
}
- // TODO: use output_resolve_row()
- if (i < 0) {
- j = n_outputs + i;
- if (j < 0) {
- throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
- }
- } else if ((size_t) i >= output_ids.size()) {
- throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
- } else {
- j = output_ids[i];
- }
-
- if (j < 0) {
- throw std::runtime_error(format("batch.logits[%d] != true", i));
- }
- if (j >= n_outputs) {
- // This should not happen
- throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
- }
-
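+        // output_resolve_row() maps i to a row in the output buffer (a negative i
+        // counts back from the last output) and throws on invalid ids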
+ const int64_t j = output_resolve_row(i);
return logits.data + j*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
}
float * llama_context::get_embeddings_ith(int32_t i) {
- int64_t j = -1;
-
output_reorder();
try {
throw std::runtime_error("no embeddings");
}
- // TODO: use output_resolve_row()
- if (i < 0) {
- j = n_outputs + i;
- if (j < 0) {
- throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
- }
- } else if ((size_t) i >= output_ids.size()) {
- throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
- } else {
- j = output_ids[i];
- }
-
- if (j < 0) {
- throw std::runtime_error(format("batch.logits[%d] != true", i));
- }
- if (j >= n_outputs) {
- // This should not happen
- throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
- }
-
+ const int64_t j = output_resolve_row(i);
const uint32_t n_embd_out = model.hparams.n_embd_out();
return embd.data + j*n_embd_out;
} catch (const std::exception & err) {
return;
}
- loras.clear();
+ loras.reset(new llama_adapter_loras());
for (size_t i = 0; i < n_adapters; i ++) {
if (scales[i] != 0.0f) {
- loras[adapters[i]] = scales[i];
+ loras->insert({adapters[i], scales[i]});
}
}
bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);
- if (n_adapters != loras.size()) {
+ if (n_adapters != loras->size()) {
return false;
}
for (size_t i = 0; i < n_adapters; i ++) {
- auto it = loras.find(adapters[i]);
+ auto it = loras->find(adapters[i]);
- if (it == loras.end() || it->second != scales[i]) {
+ if (it == loras->end() || it->second != scales[i]) {
return false;
}
}
// TODO: should we reserve?
- return cvec.apply(model, data, len, n_embd, il_start, il_end);
+ return cvec->apply(model, data, len, n_embd, il_start, il_end);
}
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
/*.gtype =*/ gtype,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
- /*.cvec =*/ &cvec,
- /*.loras =*/ &loras,
+ /*.cvec =*/ cvec.get(),
+ /*.loras =*/ loras.get(),
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.samplers =*/ sampling.samplers,
// TODO: add more model-specific info which should prevent loading the session file if not identical
}
- // write output ids
- {
- LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
-
- const auto n_outputs = this->n_outputs;
- const auto & output_ids = this->output_ids;
-
- std::vector<int32_t> w_output_pos;
-
- w_output_pos.resize(n_outputs);
-
- // build a more compact representation of the output ids
- for (size_t i = 0; i < n_batch(); ++i) {
- // map an output id to a position in the batch
- int64_t pos = output_ids[i];
- if (pos >= 0) {
- GGML_ASSERT(pos < n_outputs);
- w_output_pos[pos] = i;
- }
- }
-
- io.write(&n_outputs, sizeof(n_outputs));
-
- if (n_outputs) {
- io.write(w_output_pos.data(), n_outputs * sizeof(int32_t));
- }
- }
-
- // [TAG_CONTEXT_STATE_LOGITS]
- // write logits
- {
- LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
-
- const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens());
-
- io.write(&logits_size, sizeof(logits_size));
-
- if (logits_size) {
- io.write(logits.data, logits_size * sizeof(float));
- }
- }
-
- // write embeddings
- {
- LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
-
- const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd);
-
- io.write(&embd_size, sizeof(embd_size));
-
- if (embd_size) {
- io.write(embd.data, embd_size * sizeof(float));
- }
- }
-
- // TODO: handle sampling buffers and samplers state ?
- // https://github.com/ggml-org/llama.cpp/pull/17004
-
if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
// TODO: add more info which needs to be identical but which is not verified otherwise
}
- // read output ids
- {
- LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
-
- auto n_outputs = this->n_outputs;
- io.read_to(&n_outputs, sizeof(n_outputs));
-
- if (n_outputs > output_reserve(n_outputs)) {
- throw std::runtime_error("could not reserve outputs");
- }
-
- std::vector<int32_t> output_pos;
-
- if (n_outputs) {
- output_pos.resize(n_outputs);
- io.read_to(output_pos.data(), n_outputs * sizeof(int32_t));
-
- for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
- int32_t id = output_pos[i];
- if ((uint32_t) id >= n_batch()) {
- throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch()));
- }
- this->output_ids[id] = i;
- }
-
- this->n_outputs = n_outputs;
- }
- }
-
- // read logits
- {
- LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
-
- uint64_t logits_size;
- io.read_to(&logits_size, sizeof(logits_size));
-
- if (this->logits.size < logits_size) {
- throw std::runtime_error("logits buffer too small");
- }
-
- if (logits_size) {
- io.read_to(this->logits.data, logits_size * sizeof(float));
- }
- }
-
- // read embeddings
- {
- LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
-
- uint64_t embd_size;
- io.read_to(&embd_size, sizeof(embd_size));
-
- if (this->embd.size < embd_size) {
- throw std::runtime_error("embeddings buffer too small");
- }
-
- if (embd_size) {
- io.read_to(this->embd.data, embd_size * sizeof(float));
- }
- }
-
- // TODO: handle sampling buffers and samplers state ?
- // https://github.com/ggml-org/llama.cpp/pull/17004
-
if (memory) {
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
llama_set_param(model->cls_b, param_filter, param_filter_ud);
llama_set_param(model->cls_out, param_filter, param_filter_ud);
llama_set_param(model->cls_out_b, param_filter, param_filter_ud);
+ llama_set_param(model->cls_norm, param_filter, param_filter_ud);
for (struct llama_layer & layer : model->layers) {
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
const llama_model & model;
- llama_cparams cparams;
- llama_adapter_cvec cvec;
- llama_adapter_loras loras;
+ llama_cparams cparams;
+
+ llama_adapter_cvec_ptr cvec;
+ llama_adapter_loras_ptr loras;
llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
#include <sstream>
#include <unordered_set>
+// dedup helpers
+
+static ggml_tensor * build_kq_mask(
+ ggml_context * ctx,
+ const llama_kv_cache_context * mctx,
+ const llama_ubatch & ubatch,
+ const llama_cparams & cparams) {
+ const auto n_kv = mctx->get_n_kv();
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
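+    // the mask shape is [n_kv, n_tokens/n_stream, 1, n_stream]: a unified KV cache
+    // uses a single stream for all sequences, otherwise one stream per unique sequence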
+ return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+}
+
+static bool can_reuse_kq_mask(
+ ggml_tensor * kq_mask,
+ const llama_kv_cache_context * mctx,
+ const llama_ubatch & ubatch,
+ const llama_cparams & cparams) {
+ const auto n_kv = mctx->get_n_kv();
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ bool res = true;
+
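+    // the existing mask is reusable only if its shape matches what build_kq_mask
+    // would create for this ubatch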
+ res &= (kq_mask->ne[0] == n_kv);
+ res &= (kq_mask->ne[1] == n_tokens/n_stream);
+ res &= (kq_mask->ne[2] == 1);
+ res &= (kq_mask->ne[3] == n_stream);
+
+ return res;
+}
+
+// impl
+
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
const int64_t n_tokens = ubatch->n_tokens;
}
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
- if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
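+    // RANK pooling also needs the mean matrix: the ModernBERT reranker mean-pools
+    // before applying its classification head (see build_pooling)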
+ if (cparams.embeddings &&
+ (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN ||
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK )) {
+
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= self_kq_mask->ne[0] == mctx->get_n_kv();
- res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(self_kq_mask, mctx, params.ubatch, params.cparams);
return res;
}
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
- res &= self_kq_mask->ne[0] == mctx->get_n_kv();
- res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(self_kq_mask, mctx, params.ubatch, params.cparams);
return res;
}
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
- res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
-
- res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
- res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+ res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
return res;
}
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
- res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
- res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
- res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
- res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams);
}
// swa tensors may not be allocated if there are no SWA attention layers
res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
- res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
+ res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams);
}
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
if (down) {
cur = build_lora_mm(down, cur);
- if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
- // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
+ // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
- ggml_tensor * probs_in) const {
+ ggml_tensor * probs_in,
+ ggml_tensor * gate_up_exps) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
w_scale,
gating_op,
il,
- probs_in
+ probs_in,
+ gate_up_exps
);
}
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
- ggml_tensor * probs_in) const {
+ ggml_tensor * probs_in,
+ ggml_tensor * gate_up_exps,
+ ggml_tensor * gate_up_exps_b) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
cb(cur, "ffn_moe_weighted", il);
}
- ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
- cb(up, "ffn_moe_up", il);
+ ggml_tensor * up = nullptr;
+ ggml_tensor * experts = nullptr;
- if (up_exps_b) {
- up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
- cb(up, "ffn_moe_up_biased", il);
- }
+ if (gate_up_exps) {
+ // merged gate_up path: one mul_mat_id, then split into gate and up views
+ ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
+ cb(gate_up, "ffn_moe_gate_up", il);
- ggml_tensor * experts = nullptr;
- if (gate_exps) {
- cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ if (gate_up_exps_b) {
+ gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
+ cb(gate_up, "ffn_moe_gate_up_biased", il);
+ }
+
+ const int64_t n_ff = gate_up->ne[0] / 2;
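+            // both views alias gate_up's buffer: the gate starts at byte offset 0,
+            // the up projection at n_ff * nb[0]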
+ cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
+ up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]);
+ cb(up, "ffn_moe_up", il);
} else {
- cur = up;
- }
+ // separate gate and up path
+ up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(up, "ffn_moe_up", il);
+
+ if (up_exps_b) {
+ up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
+ cb(up, "ffn_moe_up_biased", il);
+ }
- if (gate_exps_b) {
- cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
- cb(cur, "ffn_moe_gate_biased", il);
+ if (gate_exps) {
+ cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(cur, "ffn_moe_gate", il);
+ } else {
+ cur = up;
+ }
+
+ if (gate_exps_b) {
+ cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
+ cb(cur, "ffn_moe_gate_biased", il);
+ }
}
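+    // a gate is present either as a separate tensor (gate_exps) or fused into gate_up_exps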
+ const bool has_gate = gate_exps || gate_up_exps;
+
switch (type_op) {
case LLM_FFN_SILU:
-            if (gate_exps) {
+            if (has_gate) {
cur = ggml_swiglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_swiglu", il);
} else {
cb(cur, "ffn_moe_silu", il);
} break;
case LLM_FFN_GELU:
- if (gate_exps) {
+ if (has_gate) {
cur = ggml_geglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_geglu", il);
} else {
cb(cur, "ffn_moe_swiglu_oai", il);
} break;
case LLM_FFN_RELU:
- if (gate_exps) {
+ if (has_gate) {
cur = ggml_reglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_reglu", il);
} else {
cb(cur, "ffn_moe_relu", il);
} break;
case LLM_FFN_RELU_SQR:
- if (gate_exps) {
+ if (has_gate) {
// TODO: add support for gated squared relu
GGML_ABORT("fatal error: gated squared relu not implemented");
} else {
ggml_tensor * cur;
- if (cparams.flash_attn && kq_b == nullptr) {
+ const bool use_flash_attn = cparams.flash_attn && kq_b == nullptr;
+ if (use_flash_attn) {
GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
if (v_trans) {
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
- const auto n_kv = mctx_cur->get_n_kv();
- const auto n_tokens = ubatch.n_tokens;
- const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
- inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
+
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
if (wo) {
cur = build_lora_mm(wo, cur);
- if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
- // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
+ // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
- const auto n_kv = mctx_cur->get_n_kv();
- const auto n_tokens = ubatch.n_tokens;
- const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
- inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
auto inp = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, mctx_cur);
- const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
{
- const auto n_kv = mctx_cur->get_base()->get_n_kv();
-
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
- inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
{
GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA");
- const auto n_kv = mctx_cur->get_swa()->get_n_kv();
-
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
- inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ inp->self_kq_mask_swa = build_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
ggml_set_input(inp->self_kq_mask_swa);
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
- const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
{
- const auto n_kv = attn_ctx->get_base()->get_n_kv();
-
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
- inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ inp_attn->self_kq_mask = build_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
ggml_set_input(inp_attn->self_kq_mask);
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
}
{
- const auto n_kv = attn_ctx->get_swa()->get_n_kv();
-
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
- inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ inp_attn->self_kq_mask_swa = build_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
ggml_set_input(inp_attn->self_kq_mask_swa);
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
void llm_graph_context::build_dense_out(
ggml_tensor * dense_2,
+ ggml_tensor * dense_2_b,
ggml_tensor * dense_3) const {
- if (!cparams.embeddings || !(dense_2 || dense_3)) {
+ if (!cparams.embeddings || !(dense_2 || dense_2_b || dense_3)) {
return;
}
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
if (dense_2) {
cur = ggml_mul_mat(ctx0, dense_2, cur);
}
+ if (dense_2_b) {
+ cur = ggml_add(ctx0, cur, dense_2_b);
+ }
if (dense_3) {
cur = ggml_mul_mat(ctx0, dense_3, cur);
}
ggml_tensor * cls,
ggml_tensor * cls_b,
ggml_tensor * cls_out,
- ggml_tensor * cls_out_b) const {
+ ggml_tensor * cls_out_b,
+ ggml_tensor * cls_norm) const {
if (!cparams.embeddings) {
return;
}
} break;
case LLAMA_POOLING_TYPE_RANK:
{
- ggml_tensor * inp_cls = build_inp_cls();
- cur = ggml_get_rows(ctx0, inp, inp_cls);
+ if (arch == LLM_ARCH_MODERN_BERT) {
+                    // the ModernBERT GTE reranker mean-pools first, then applies the prediction head and classifier
+ // https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modular_modernbert.py#L1404-1411
+ ggml_tensor * inp_mean = build_inp_mean();
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
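+                    // inp is [n_embd, n_tokens]; multiplying its transpose by the mean
+                    // weights yields one mean-pooled embedding per sequence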
+ } else {
+ ggml_tensor * inp_cls = build_inp_cls();
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
+ }
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
if (cls_b) {
cur = ggml_add(ctx0, cur, cls_b);
}
- cur = ggml_tanh(ctx0, cur);
+ if (arch == LLM_ARCH_MODERN_BERT) {
+ cur = ggml_gelu(ctx0, cur);
+ } else {
+ cur = ggml_tanh(ctx0, cur);
+ }
+ if (cls_norm) {
+ // head norm
+ cur = build_norm(cur, cls_norm, NULL, LLM_NORM, -1);
+ }
}
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
- ggml_tensor * probs_in = nullptr) const;
+ ggml_tensor * probs_in = nullptr,
+ ggml_tensor * gate_up_exps = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
- ggml_tensor * probs_in = nullptr) const;
+ ggml_tensor * probs_in = nullptr,
+ ggml_tensor * gate_up_exps = nullptr,
+ ggml_tensor * gate_up_exps_b = nullptr) const;
//
// inputs
ggml_tensor * cls,
ggml_tensor * cls_b,
ggml_tensor * cls_out,
- ggml_tensor * cls_out_b) const;
+ ggml_tensor * cls_out_b,
+ ggml_tensor * cls_norm) const;
//
// sampling (backend sampling)
void build_dense_out(
ggml_tensor * dense_2,
+ ggml_tensor * dense_2_b,
ggml_tensor * dense_3) const;
};
std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
char buf[256];
- snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+ snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
- snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
}
return buf;
}
if (model.arch == LLM_ARCH_STEP35) {
return false;
}
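+    // models with more than one position per embedding (e.g. M-RoPE) are not supported here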
+ if (hparams.n_pos_per_embd() > 1) {
+ return false;
+ }
return true;
}
const auto & cell = cells[tail_id];
// partial intersection is invalid if it includes the final pos
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
- //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
+ //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
return false;
}
// invalidate tails which will be cleared
add_tensor(model.cls_b);
add_tensor(model.cls_out);
add_tensor(model.cls_out_b);
+ add_tensor(model.cls_norm);
for (const struct llama_layer & layer : model.layers) {
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
case LLM_TYPE_8B_A1B: return "8B.A1B";
case LLM_TYPE_16B_A1B: return "16B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
+ case LLM_TYPE_24B_A2B: return "24B.A2B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_35B_A3B: return "35B.A3B";
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
- hparams.set_swa_pattern(swa_period);
+ hparams.set_swa_pattern(swa_period, true);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
type = LLM_TYPE_250M;
}
} break;
+ case LLM_ARCH_EUROBERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ if (hparams.n_layer == 12) {
+ type = LLM_TYPE_SMALL; // 0.2B
+ }
+ } break;
case LLM_ARCH_BLOOM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
} break;
case LLM_ARCH_DEEPSEEK2:
{
- // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
- const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+ // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
+ const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+ // NextN/MTP parameters (GLM-OCR)
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+            // TODO: when MTP is implemented, this may need to be updated
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
switch (hparams.n_layer) {
+ case 17: type = LLM_TYPE_1B; break; // GLM-OCR
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_JAIS2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ case 68: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_NEMOTRON:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
} break;
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
+ case LLM_ARCH_PADDLEOCR:
{
+            // paddleocr needs mrope_section
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
if (arch == LLM_ARCH_ERNIE4_5_MOE) {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
case 10752: type = LLM_TYPE_2_6B; break;
default: type = LLM_TYPE_UNKNOWN;
}
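+            // a non-zero sliding-window size enables standard SWA on the attention
+            // (i.e. non-recurrent) layers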
+ if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
+ }
+ }
} break;
case LLM_ARCH_LFM2MOE:
{
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
}
- type = LLM_TYPE_8B_A1B;
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_8B_A1B; break;
+ case 40: type = LLM_TYPE_24B_A2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
} break;
case LLM_ARCH_SMALLTHINKER:
{
// TODO: move to a separate function
const auto tn = LLM_TN(arch);
+
+ // helper: try merged gate_up_exps first, fall back to separate gate and up
+ auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
+ layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
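+        // the fused tensor is expected to store the n_ff gate rows first, followed by the n_ff up rows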
+ if (layer.ffn_gate_up_exps == nullptr) {
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
+ }
+ };
switch (arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
}
- cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
- cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
- cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_norm = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_NEO_BERT:
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
}
} break;
+ case LLM_ARCH_EUROBERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ }
+ } break;
case LLM_ARCH_JINA_BERT_V2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
}
// MoE branch
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
// Shared expert branch
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
}
} break;
+ case LLM_ARCH_JAIS2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // attention biases - all have shape n_embd (output dimension of projections)
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ // Jais-2 uses simple MLP (no gate) with biases
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ }
+ } break;
case LLM_ARCH_CHATGLM:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
}
for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
if (layer.wqkv == nullptr) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
}
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, flags);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
+
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+ }
}
} break;
case LLM_ARCH_GLM4_MOE:
} break;
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
+ case LLM_ARCH_PADDLEOCR:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
}
// for LFM2-ColBert-350M
- dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
+ dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"), {hparams.n_embd_out() }, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_SMALLTHINKER:
{
}
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
// Shared experts
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
}
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
// Shared experts
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
+ case LLM_ARCH_EUROBERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING:
{
llm = std::make_unique<llm_build_neo_bert>(*this, params);
} break;
+ case LLM_ARCH_EUROBERT:
+ {
+ llm = std::make_unique<llm_build_eurobert>(*this, params);
+ } break;
case LLM_ARCH_BLOOM:
{
llm = std::make_unique<llm_build_bloom>(*this, params);
{
llm = std::make_unique<llm_build_jais>(*this, params);
} break;
+ case LLM_ARCH_JAIS2:
+ {
+ llm = std::make_unique<llm_build_jais2>(*this, params);
+ } break;
case LLM_ARCH_NEMOTRON:
{
llm = std::make_unique<llm_build_nemotron>(*this, params);
{
llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
} break;
+ case LLM_ARCH_PADDLEOCR:
+ {
+ llm = std::make_unique<llm_build_paddleocr>(*this, params);
+ } break;
case LLM_ARCH_HUNYUAN_MOE:
{
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
{
- llm = std::make_unique<llm_build_lfm2>(*this, params);
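+                // the bool template parameter selects the SWA / non-SWA graph variant at compile time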
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
+ }
} break;
case LLM_ARCH_SMALLTHINKER:
{
}
// add on pooling layer
- llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
+ llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
// add backend sampling layers (if any)
llm->build_sampling();
// there will be two additional dense projection layers
// dense linear projections are applied after pooling
// TODO: move reranking logic here and generalize
- llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+ llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
llm->res->set_outputs();
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_EUROBERT:
case LLM_ARCH_STABLELM:
case LLM_ARCH_BITNET:
case LLM_ARCH_QWEN:
case LLM_ARCH_BAILINGMOE2:
case LLM_ARCH_DOTS1:
case LLM_ARCH_HUNYUAN_MOE:
+ case LLM_ARCH_JAIS2:
case LLM_ARCH_OPENAI_MOE:
case LLM_ARCH_HUNYUAN_DENSE:
case LLM_ARCH_LFM2:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:
+ case LLM_ARCH_PADDLEOCR:
return LLAMA_ROPE_TYPE_MROPE;
case LLM_ARCH_QWEN3VL:
case LLM_ARCH_QWEN3VLMOE:
LLM_TYPE_8B_A1B, // lfm2moe
LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small
+ LLM_TYPE_24B_A2B, // lfm2moe
LLM_TYPE_30B_A3B,
LLM_TYPE_31B_A3_5B,
LLM_TYPE_35B_A3B, // Qwen3.5
struct ggml_tensor * ffn_up_enc = nullptr;
// ff MoE
- struct ggml_tensor * ffn_gate_inp = nullptr;
- struct ggml_tensor * ffn_gate_exps = nullptr;
- struct ggml_tensor * ffn_down_exps = nullptr;
- struct ggml_tensor * ffn_up_exps = nullptr;
- struct ggml_tensor * ffn_gate_inp_b = nullptr;
- struct ggml_tensor * ffn_gate_exps_b = nullptr;
- struct ggml_tensor * ffn_down_exps_b = nullptr;
- struct ggml_tensor * ffn_up_exps_b = nullptr;
+ struct ggml_tensor * ffn_gate_inp = nullptr;
+ struct ggml_tensor * ffn_gate_exps = nullptr;
+ struct ggml_tensor * ffn_down_exps = nullptr;
+ struct ggml_tensor * ffn_up_exps = nullptr;
+ struct ggml_tensor * ffn_gate_up_exps = nullptr;
+ struct ggml_tensor * ffn_gate_inp_b = nullptr;
+ struct ggml_tensor * ffn_gate_exps_b = nullptr;
+ struct ggml_tensor * ffn_down_exps_b = nullptr;
+ struct ggml_tensor * ffn_up_exps_b = nullptr;
+ struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
struct ggml_tensor * cls_b = nullptr;
struct ggml_tensor * cls_out = nullptr;
struct ggml_tensor * cls_out_b = nullptr;
+ struct ggml_tensor * cls_norm = nullptr;
struct ggml_tensor * conv1d = nullptr;
struct ggml_tensor * conv1d_b = nullptr;
//Dense linear projections for SentenceTransformers models like embeddinggemma
// For Sentence Transformers models structure see
// https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
- struct ggml_tensor * dense_2_out_layers = nullptr;
- struct ggml_tensor * dense_3_out_layers = nullptr;
+ struct ggml_tensor * dense_2_out_layers = nullptr;
+ struct ggml_tensor * dense_2_out_layers_b = nullptr;
+ struct ggml_tensor * dense_3_out_layers = nullptr;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
return new_size;
}
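+// returns true when quantizing to dst_type without an importance matrix would
+// produce garbage output (i.e. for the very low-bit quant types)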
+static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) {
+    return (
+        dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
+        dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S  ||
+        dst_type == GGML_TYPE_IQ2_S  ||
+        ( // IQ1_M is allowed without an imatrix only for token embeddings and the output tensor
+            dst_type == GGML_TYPE_IQ1_M &&
+            strcmp(t->name, "token_embd.weight") != 0 && strcmp(t->name, "output.weight") != 0
+        ) ||
+        ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings
+            dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0
+        )
+    );
+}
+
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
};
const auto tn = LLM_TN(model.arch);
- new_ofstream(0);
+
+ // no output file for --dry-run
+ if (!params->dry_run) {
+ new_ofstream(0);
+ }
+
+    // flag for `--dry-run`: as a courtesy, lets the user know whether an imatrix
+    // will be required for a real quantization
+ bool will_require_imatrix = false;
+
for (const auto * it : tensors) {
const auto & weight = *it;
ggml_tensor * tensor = weight.tensor;
- if (weight.idx != cur_split && params->keep_split) {
+ if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
}
const std::string name = ggml_get_name(tensor);
+ const size_t tensor_size = ggml_nbytes(tensor);
- if (!ml.use_mmap) {
- if (read_data.size() < ggml_nbytes(tensor)) {
- read_data.resize(ggml_nbytes(tensor));
+ if (!params->dry_run) {
+ if (!ml.use_mmap) {
+ if (read_data.size() < tensor_size) {
+ read_data.resize(tensor_size);
+ }
+ tensor->data = read_data.data();
}
- tensor->data = read_data.data();
+ ml.load_data_for(tensor);
}
- ml.load_data_for(tensor);
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
quantize = tensor->type != new_type;
}
- if (!quantize) {
- new_type = tensor->type;
- new_data = tensor->data;
- new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ // we have now decided on the target type for this tensor
+ if (params->dry_run) {
+        // the --dry-run option calculates the final quantized size without actually quantizing
+ if (quantize) {
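+                // estimate the quantized size from the target type's row size;
+                // no tensor data is read or written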
+ new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
+ tensor_size/1024.0/1024.0,
+ new_size/1024.0/1024.0,
+ ggml_type_name(new_type));
+ if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ will_require_imatrix = true;
+ }
+ } else {
+ new_size = tensor_size;
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
+ }
+ total_size_org += tensor_size;
+ total_size_new += new_size;
+ continue;
} else {
- const int64_t nelements = ggml_nelements(tensor);
+ // no --dry-run, perform quantization
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = tensor_size;
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
+ } else {
+ const int64_t nelements = ggml_nelements(tensor);
- const float * imatrix = nullptr;
- if (imatrix_data) {
- auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
- if (it == imatrix_data->end()) {
- LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
- } else {
- if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
- imatrix = it->second.data();
+ const float * imatrix = nullptr;
+ if (imatrix_data) {
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+ if (it == imatrix_data->end()) {
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
- LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
-
- // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
- // this is a significant error and it may be good idea to abort the process if this happens,
- // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
- // tok_embd should be ignored in this case, since it always causes this warning
- if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
- throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
+ imatrix = it->second.data();
+ } else {
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+                            // this is a significant error and it may be a good idea to abort the process if this happens,
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+ // tok_embd should be ignored in this case, since it always causes this warning
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+ }
}
}
}
- }
- if ((new_type == GGML_TYPE_IQ2_XXS ||
- new_type == GGML_TYPE_IQ2_XS ||
- new_type == GGML_TYPE_IQ2_S ||
- new_type == GGML_TYPE_IQ1_S ||
- (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
- (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
- LLAMA_LOG_ERROR("\n\n============================================================\n");
- LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
- LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
- LLAMA_LOG_ERROR("============================================================\n\n");
- throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
- }
+ if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+ LLAMA_LOG_ERROR("============================================================\n\n");
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+ }
- float * f32_data;
+ float * f32_data;
- if (tensor->type == GGML_TYPE_F32) {
- f32_data = (float *) tensor->data;
- } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
- throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
- } else {
- llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
- f32_data = (float *) f32_conv_buf.data();
- }
+ if (tensor->type == GGML_TYPE_F32) {
+ f32_data = (float *) tensor->data;
+ } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
+ } else {
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.data();
+ }
- LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
- fflush(stdout);
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+ fflush(stdout);
- if (work.size() < (size_t)nelements * 4) {
- work.resize(nelements * 4); // upper bound on size
- }
- new_data = work.data();
+ if (work.size() < (size_t)nelements * 4) {
+ work.resize(nelements * 4); // upper bound on size
+ }
+ new_data = work.data();
- const int64_t n_per_row = tensor->ne[0];
- const int64_t nrows = tensor->ne[1];
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];
- static const int64_t min_chunk_size = 32 * 512;
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
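+ // each chunk is a whole number of rows totalling at least min_chunk_size (16384) elements,
+ // e.g. n_per_row = 4096 -> chunk_size = 4096 * ceil(16384/4096) = 16384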
- const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
- const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
- const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
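+ // e.g. with nthread = 8 but only nchunk = 3, three worker threads are used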
- // quantize each expert separately since they have different importance matrices
- new_size = 0;
- for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
- const float * f32_data_03 = f32_data + i03 * nelements_matrix;
- void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
- const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+ // quantize each expert separately since they have different importance matrices
+ new_size = 0;
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
- new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
- // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+ // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
#if 0
- if (new_type == GGML_TYPE_MXFP4) {
- auto * x = f32_data_03;
-
- //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
- std::vector<float> deq(nrows*n_per_row);
- const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
- qtype->to_float(new_data_03, deq.data(), deq.size());
-
- double err = 0.0f;
- for (int i = 0; i < (int) deq.size(); ++i) {
- err += fabsf(deq[i] - x[i]);
- //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
- if (deq[i] != x[i]) {
- LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ if (new_type == GGML_TYPE_MXFP4) {
+ auto * x = f32_data_03;
+
+ //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+ std::vector<float> deq(nrows*n_per_row);
+ const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+ qtype->to_float(new_data_03, deq.data(), deq.size());
+
+ double err = 0.0f;
+ for (int i = 0; i < (int) deq.size(); ++i) {
+ err += fabsf(deq[i] - x[i]);
+ //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+ if (deq[i] != x[i]) {
+ LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ }
}
+ //LLAMA_LOG_INFO("err = %f\n", err);
+ GGML_ASSERT(err == 0.00000);
}
- //LLAMA_LOG_INFO("err = %f\n", err);
- GGML_ASSERT(err == 0.00000);
- }
#endif
+ }
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
}
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
- }
- total_size_org += ggml_nbytes(tensor);
- total_size_new += new_size;
+ total_size_org += tensor_size;
+ total_size_new += new_size;
+
+ // update the gguf meta data as we go
+ gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+
+ // write tensor data + padding
+ fout.write((const char *) new_data, new_size);
+ zeros(fout, GGML_PAD(new_size, align) - new_size);
+ } // no --dry-run
+ } // iterate over tensors
+
+ if (!params->dry_run) {
+ close_ofstream();
+ }
- // update the gguf meta data as we go
- gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
- GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
- gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+ LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
- // write tensor data + padding
- fout.write((const char *) new_data, new_size);
- zeros(fout, GGML_PAD(new_size, align) - new_size);
+ if (!params->imatrix && params->dry_run && will_require_imatrix) {
+ LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
+ __func__
+ );
}
- close_ofstream();
-
- LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
- LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
/*.only_copy =*/ false,
/*.pure =*/ false,
/*.keep_split =*/ false,
+ /*.dry_run =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_JAIS2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)|\\s{1,2}(?!\\S)|\\s{1}",
+
+ // adapted: same as llama3 but with cascading whitespace pattern
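+ // the \s{512}|\s{256}|...|\s{1,2} cascade bounds each whitespace match at
+ // 512 characters, where llama3 uses the unbounded \s+(?!\S)|\s+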
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)|\\s{1,2}(?!\\S)|\\s{1}",
+ };
+ break;
case LLAMA_VOCAB_PRE_TYPE_DBRX:
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
regex_exprs = {
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
+ case LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM:
regex_exprs = {
"\\p{N}{1,3}",
"[一-龥-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
+ regex_exprs = {
+ // original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
+ "\\d{1,3}(?=(?:\\d{3})*\\b)",
+ // original regex from tokenizer.json: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
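+ // adapted: the case-insensitive contraction groups (?i:'s|'t|...) are expanded into explicit [sS]-style classes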
+ "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
regex_exprs = {
// K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
tokenizer_pre == "falcon-h1" ||
tokenizer_pre == "pixtral" ||
tokenizer_pre == "midm-2.0" ||
- tokenizer_pre == "lfm2") {
+ tokenizer_pre == "lfm2" ||
+ tokenizer_pre == "jina-v5-nano") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum" ||
- tokenizer_pre == "modern-bert" ) {
+ tokenizer_pre == "modern-bert") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "jais-2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if (
- tokenizer_pre == "gpt-4o" ||
- tokenizer_pre == "llama4") {
+ tokenizer_pre == "gpt-4o" ||
+ tokenizer_pre == "llama4" ||
+ tokenizer_pre == "kanana2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
clean_spaces = false;
+ } else if (
+ tokenizer_pre == "tiny_aya") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
+ clean_spaces = false;
} else if (
tokenizer_pre == "superbpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
tokenizer_pre == "hunyuan-dense") {
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
clean_spaces = false;
+ } else if (
+ tokenizer_pre == "joyai-llm") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM;
+ clean_spaces = false;
} else if (
tokenizer_pre == "kimi-k2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
+ || t.first == "</s>" // paddleocr
|| t.first == "<|eom_id|>"
|| t.first == "<EOT>"
|| t.first == "_<EOT>"
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
+ LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
+ LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
+ LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
};
struct LLM_KV;
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
+ bool dry_run; // calculate and show the final quantization size without performing quantization
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
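+
+ // a minimal usage sketch for the new dry_run flag (llama_model_quantize and
+ // llama_model_quantize_default_params are the existing public C API; the
+ // .gguf paths below are illustrative):
+ //
+ //   llama_model_quantize_params qp = llama_model_quantize_default_params();
+ //   qp.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
+ //   qp.dry_run = true; // report tensor types and sizes, skip writing tensor data
+ //   llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qp);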
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
+ il,
+ nullptr,
+ model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
// FFN shared expert
--- /dev/null
+#include "models.h"
+
+#define CHUNK_SIZE 64
+
+// utility to get one slice from the third dimension
+// input dim: [x, y, c, b]
+// output dim: [x, y, 1, b]
+static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
+ return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
+ t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
+}
+
+llm_build_delta_net_base::llm_build_delta_net_base(const llm_graph_params & params) : llm_graph_context(params) {}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+ const bool kda = (g->ne[0] == S_k && g->ne[1] == H_k);
+
+ GGML_ASSERT(S_k == S_v);
+ GGML_ASSERT(H_v % H_k == 0);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
+
+ GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
+ GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
+ GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
+ GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
+
+ const float scale = 1.0f / sqrtf(S_k);
+
+ q = ggml_scale(ctx0, q, scale);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(b, "b_in", il);
+ cb(g, "g_in", il);
+
+ q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
+ k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
+ v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
+ g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
+ b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
+
+ const int CS = CHUNK_SIZE;
+
+ const int pad = (CS - n_tokens % CS) % CS;
+ const int n_chunks = (n_tokens + pad) / CS;
+
+ q = ggml_pad(ctx0, q, 0, pad, 0, 0);
+ k = ggml_pad(ctx0, k, 0, pad, 0, 0);
+ v = ggml_pad(ctx0, v, 0, pad, 0, 0);
+ g = ggml_pad(ctx0, g, 0, pad, 0, 0);
+ b = ggml_pad(ctx0, b, 0, pad, 0, 0);
+
+ ggml_tensor * v_b = ggml_mul(ctx0, v, b);
+ ggml_tensor * k_b = ggml_mul(ctx0, k, b);
+
+ cb(v_b, "v_b", il);
+ cb(k_b, "k_b", il);
+
+ q = ggml_reshape_4d(ctx0, q, S_k, CS, n_chunks, H_k * n_seqs);
+ k = ggml_reshape_4d(ctx0, k, S_k, CS, n_chunks, H_k * n_seqs);
+ k_b = ggml_reshape_4d(ctx0, k_b, S_k, CS, n_chunks, H_v * n_seqs);
+ v = ggml_reshape_4d(ctx0, v, S_v, CS, n_chunks, H_v * n_seqs);
+ v_b = ggml_reshape_4d(ctx0, v_b, S_v, CS, n_chunks, H_v * n_seqs);
+
+ g = ggml_reshape_4d(ctx0, g, g->ne[0], CS, n_chunks, H_v * n_seqs);
+ b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);
+
+ // [CS, g_0, n_chunks, H_v * n_seqs]
+ // TODO: extend ggml_cumsum with axis parameter to avoid transpose
+ ggml_tensor * g_cs = ggml_cumsum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, g)));
+ cb(g_cs, "g_cs", il);
+
+ ggml_tensor * kb = nullptr;
+ ggml_tensor * kq = nullptr;
+ if (kda) {
+ const int64_t CHB = n_chunks * H_k * n_seqs;
+
+ ggml_tensor * g_cs_i = ggml_reshape_4d(ctx0, g_cs, CS, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB]
+ ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, S_k, CHB); // [1, chunk_size, S_k, CHB]
+
+ g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB]
+
+ // decay_mask [chunk_size,chunk_size,S_k,CHB]
+ ggml_tensor * decay_mask;
+ decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
+ decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
+ decay_mask = ggml_exp(ctx0, decay_mask);
+ cb(decay_mask, "decay_mask", il);
+
+ // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched
+ decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, CS, CS, CHB);
+
+ ggml_tensor * k_b_i = ggml_reshape_4d(ctx0, k_b, S_k, CS, 1, CHB);
+ ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, CS, CHB);
+ ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, CS, 1, CHB);
+
+ ggml_tensor * decay_k_b_i = ggml_mul(ctx0, decay_mask, k_b_i);
+ ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i);
+
+ // decay_k_b_i [S,BT,BT,CHB] @ k_j [S,1,BT,CHB] = kb [BT,1,BT,CHB]
+ kb = ggml_mul_mat(ctx0, decay_k_b_i, k_j);
+ kq = ggml_mul_mat(ctx0, decay_q_i, k_j);
+
+ kb = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, kb, CS, CS, n_chunks, H_v * n_seqs)));
+ kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, kq, CS, CS, n_chunks, H_v * n_seqs)));
+ } else {
+ ggml_tensor * g_cs_i = g_cs;
+ ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);
+
+ g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);
+
+ // [CS, CS, n_chunks, H_v * n_seqs]
+ ggml_tensor * decay_mask;
+ decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
+ decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
+ decay_mask = ggml_exp(ctx0, decay_mask);
+ cb(decay_mask, "decay_mask", il);
+
+ // [CS, CS, n_chunks, H_k * n_seqs]
+ kb = ggml_mul_mat(ctx0, k, k_b);
+ kb = ggml_mul (ctx0, kb, decay_mask);
+
+ // [CS, CS, n_chunks, H_k * n_seqs]
+ kq = ggml_mul_mat(ctx0, k, q);
+ kq = ggml_mul(ctx0, kq, decay_mask);
+ }
+
+ kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
+ cb(kq, "kq", il);
+
+ // [CS, CS, n_chunks, H_k * n_seqs]
+ ggml_tensor * attn;
+ attn = ggml_tri(ctx0, kb, GGML_TRI_TYPE_LOWER);
+ cb(attn, "attn", il);
+
+ ggml_tensor * identity;
+ identity = ggml_view_1d(ctx0, attn, CS, 0);
+ identity = ggml_fill (ctx0, identity, 1.0f);
+ identity = ggml_diag (ctx0, identity);
+
+ ggml_tensor * lhs = ggml_add(ctx0, attn, identity);
+ cb(lhs, "dnet_add_ch_lhs", il);
+
+ attn = ggml_neg(ctx0, attn);
+ cb(attn, "attn_pre_solve", il);
+
+ ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+ attn = ggml_add(ctx0, lin_solve, identity);
+ cb(attn, "dnet_add_ch_attn_solved", il); // [CS, CS, n_chunks, H_k * n_seqs]
+
+ // [S_v, CS, n_chunks, H_v * n_seqs]
+ v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_b)), attn);
+
+ // [CS, 1, n_chunks, H_v * n_seqs] KDA: [CS, S_k, n_chunks, H_v * n_seqs]
+ ggml_tensor * g_exp = ggml_exp(ctx0, g_cs);
+
+ k_b = ggml_cont(ctx0, ggml_transpose(ctx0, k_b));
+
+ // [CS, S_k, n_chunks, H_k * n_seqs]
+ ggml_tensor * kbg = ggml_mul(ctx0, k_b, g_exp);
+ cb(kbg, "k_beta_g_exp", il);
+
+ // [S_k, CS, n_chunks, H_k * n_seqs]
+ ggml_tensor * k_cd = ggml_mul_mat(ctx0, kbg, attn);
+ cb(k_cd, "k_cumdecay", il);
+
+ // [1, CS, n_chunks, H_k * n_seqs] KDA: [S_k, CS, n_chunks, H_k * n_seqs]
+ ggml_tensor * g_exp_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_exp));
+ ggml_tensor * q_g_exp = ggml_mul(ctx0, q, g_exp_t);
+
+ // vectorized calculation of key_gdiff
+ // improved from the chunked version:
+ // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+ // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+ // key_gdiff = key * g_diff.unsqueeze(-1)
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
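+ // note: the reference clamps to max=50.0 before exp; here exp is applied directly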
+
+ // get last element in g_cumsum along CS dimension (ne0)
+ // example: [[x, y, z, ..., last], ...] -> [[last], ...]
+ // [1, 1, n_chunks, H_v * n_seqs] KDA: [1, S_k, n_chunks, H_v * n_seqs]
+ ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, g_cs->ne[1], g_cs->ne[2], g_cs->ne[3],
+ g_cs->nb[1],
+ g_cs->nb[2],
+ g_cs->nb[3],
+ ggml_row_size(g_cs->type, g_cs->ne[0] - 1));
+ cb(g_last, "g_last", il);
+
+ // TODO: remove this cont when CUDA supports non-cont unary ops
+ g_last = ggml_cont(ctx0, g_last);
+
+ // [1, 1, n_chunks, H_v * n_seqs] KDA: [S_k, 1, n_chunks, H_v * n_seqs]
+ ggml_tensor * g_last_exp_t = ggml_transpose(ctx0, ggml_exp(ctx0, g_last));
+ cb(g_last_exp_t, "g_last_exp_t", il);
+
+ // [CS, 1, n_chunks, H_v * n_seqs] KDA: [CS, S_k, n_chunks, H_v * n_seqs]
+ ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cs, g_last));
+ cb(g_diff, "g_diff", il);
+
+ ggml_tensor * g_diff_exp_t = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_exp(ctx0, g_diff)));
+
+ // [S_k, CS, n_chunks, H_v * n_seqs]
+ ggml_tensor * kg = ggml_mul(ctx0, k, g_diff_exp_t);
+ cb(kg, "key_gdiff", il);
+
+ // [CS, S_k, n_chunks, H_v * n_seqs]
+ ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
+ cb(kg_t, "key_gdiff_t", il);
+
+ ggml_tensor * s_t = ggml_transpose(ctx0, s);
+ s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
+ cb(s_t, "dnet_add_ch_state", il);
+
+ // [CS, S_v, n_chunks, H_v * n_seqs]
+ ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
+
+ for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+ ggml_tensor * ch_k_cd = get_slice_2d(ctx0, k_cd, chunk); // [S_k, CS, 1, H_k * n_seqs]
+ ggml_tensor * ch_v_t = get_slice_2d(ctx0, v_t, chunk); // [ CS, S_v, 1, H_v * n_seqs]
+ ggml_tensor * ch_kq = get_slice_2d(ctx0, kq, chunk); // [ CS, CS, 1, H_k * n_seqs]
+ ggml_tensor * ch_q_g_exp = get_slice_2d(ctx0, q_g_exp, chunk); // [S_k, CS, 1, H_k * n_seqs]
+ ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
+
+ // [CS, S_v, 1, H_v * n_seqs]
+ ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
+ cb(v_t_p, "v_prime", il);
+
+ // [CS, S_v, 1, H_v * n_seqs]
+ ggml_tensor * v_t_new = ggml_sub(ctx0, ch_v_t, v_t_p);
+ cb(v_t_new, "v_t_new", il);
+
+ // [S_v, CS, 1, H_v * n_seqs]
+ ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_t_new, ch_kq);
+ cb(v_attn, "v_attn", il);
+
+ // [S_v, CS, 1, H_v * n_seqs]
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
+ cb(attn_inter, "attn_inter", il);
+
+ // [S_v, CS, 1, H_v * n_seqs]
+ ggml_tensor * o_ch = ggml_add(ctx0, attn_inter, v_attn);
+ cb(o_ch, "dnet_add_ch_attn_out", il);
+
+ v = ggml_set_inplace(ctx0, v, o_ch, v->nb[1], v->nb[2], v->nb[3], chunk * v->nb[2]);
+
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ // TODO: head broadcast might not work here - probably will need a transpose
+ ggml_tensor * kgv = ggml_mul_mat(ctx0, ch_kg_t, v_t_new); // [S_k, S_v, 1, H_k * n_seqs]
+
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+ ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
+
+ s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
+ s_t = ggml_add(ctx0, s_t, kgv);
+ cb(s_t, "dnet_add_ch_state", il);
+ }
+
+ s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
+
+ // truncate padded tokens
+ ggml_tensor * o = ggml_view_4d(ctx0, v,
+ S_v, n_tokens, H_v, n_seqs,
+ ggml_row_size(v->type, S_v),
+ ggml_row_size(v->type, S_v * CS * n_chunks),
+ ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
+ o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
+ s = ggml_transpose(ctx0, s_t);
+ cb(s, "output_state", il);
+
+ return {o, s};
+}
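+
+// Both paths implement the (key-)gated delta rule; per token, ignoring the
+// head/batch dims and the 1/sqrt(S_k) scaling of q, the math is roughly:
+//
+//   S_t = exp(g_t) * S_{t-1} + k_t (beta_t * (v_t - S_{t-1}^T k_t))^T
+//   o_t = S_t^T q_t
+//
+// build_delta_net_chunking evaluates this recurrence CHUNK_SIZE tokens at a
+// time; build_delta_net_autoregressive below is the n_tokens == 1 case.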
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b, // beta
+ ggml_tensor * s, // state
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(n_tokens == 1);
+
+ GGML_ASSERT(S_k == S_v);
+ GGML_ASSERT(H_v % H_k == 0);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
+
+ GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
+ GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
+ GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
+ GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
+
+ const float scale = 1.0f / sqrtf(S_k);
+
+ q = ggml_scale(ctx0, q, scale);
+
+ q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
+ k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
+ v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(b, "b_in", il);
+ cb(g, "g_in", il);
+
+ // GDA: [1, 1, H_v, n_seqs]
+ // KDA: [1, S_k, H_v, n_seqs]
+ g = ggml_reshape_4d(ctx0, g, 1, g->ne[0], H_v, n_seqs);
+ b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);
+
+ // [S_v, S_v, H_v, n_seqs]
+ g = ggml_exp(ctx0, g);
+ s = ggml_mul(ctx0, s, g);
+
+ ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
+
+ // [1, S_v, H_v, n_seqs]
+ ggml_tensor * sk;
+ sk = ggml_mul (ctx0, s_t, k);
+ sk = ggml_sum_rows(ctx0, sk);
+
+ // [S_v, 1, H_v, n_seqs]
+ ggml_tensor * d;
+ d = ggml_sub(ctx0, v, ggml_transpose(ctx0, sk));
+ d = ggml_mul(ctx0, d, b);
+
+ // [1, S_v, H_v, n_seqs]
+ ggml_tensor * d_t;
+ d_t = ggml_transpose(ctx0, d);
+
+ // [S_v, S_v, H_v, n_seqs]
+ ggml_tensor * kd;
+ k = ggml_repeat(ctx0, k, s);
+ kd = ggml_mul (ctx0, k, d_t);
+
+ s_t = ggml_add(ctx0, s_t, kd);
+
+ cb(s_t, "dnet_add_ar_state", il);
+
+ ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
+ ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
+
+ o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
+ s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
+
+ return {o, s};
+}
--- /dev/null
+#include "models.h"
+
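+// EuroBERT encoder graph
+// LLaMA-style blocks (RMSNorm, RoPE, SiLU-gated FFN) with non-causal
+// attention and no KV cache; the graph outputs embeddings (res->t_embd)
+// rather than logits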
+llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "inp_embd", -1);
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+
+ {
+ ggml_tensor * Qcur;
+ ggml_tensor * Kcur;
+ ggml_tensor * Vcur;
+
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = ggml_add(ctx0, cur, inpL);
+
+ ggml_tensor * ffn_inp = cur;
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_embd", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
#include "models.h"
-
-
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params) {
+ llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
ggml_tensor * cur;
ggml_tensor * inp_out_ids = build_inp_out_ids();
- for (int il = 0; il < n_layer; ++il) {
+ // Only process up to the last transformer layer (skip the final NextN layer)
+ // The NextN layer tensors are loaded but not used in the forward pass
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+ for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// Pre-attention norm
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
- if (il == n_layer - 1 && inp_out_ids) {
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "post_mlp_norm", il);
}
- // Add residual connection after post-MLP norm
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
// Final norm
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params) {
+ llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+++ /dev/null
-#include "models.h"
-
-llm_graph_context_mamba::llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
-
-ggml_tensor * llm_graph_context_mamba::build_mamba_layer(llm_graph_input_rs * inp,
- ggml_tensor * cur,
- const llama_model & model,
- const llama_ubatch & ubatch,
- int il) {
- const auto * mctx_cur = inp->mctx;
-
- const auto kv_head = mctx_cur->get_head();
-
- const auto & layer = model.layers[il];
-
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t dt_rank = hparams.ssm_dt_rank;
- const int64_t n_head = d_inner;
- const int64_t head_dim = 1;
- const int64_t n_seqs = ubatch.n_seqs;
- // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
- const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
-
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs());
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
- ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
- ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
-
- ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
- conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
- // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
- // split the above in two
- // => {d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
- ggml_tensor * z =
- ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner * ggml_element_size(xz));
-
- // conv
- {
- // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
- ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
-
- // copy last (d_conv - 1) columns back into the state cache
- ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
- n_seq_tokens * (conv_x->nb[0]));
-
- ggml_build_forward_expand(
- gf, ggml_cpy(ctx0, last_conv,
- ggml_view_1d(ctx0, conv_states_all, (d_conv - 1) * (d_inner) * (n_seqs),
- kv_head * (d_conv - 1) * (d_inner) *ggml_element_size(conv_states_all))));
-
- // 1D convolution
- // The equivalent is to make a self-overlapping view of conv_x
- // over d_conv columns at each stride in the 3rd dimension,
- // then element-wise multiply that with the conv1d weight,
- // then sum the elements of each row,
- // (the last two steps are a dot product over rows (also doable with mul_mat))
- // then permute away the ne[0] dimension,
- // and then you're left with the resulting x tensor.
- // For simultaneous sequences, all sequences need to have the same length.
- x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
-
- // bias
- x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
-
- x = ggml_silu(ctx0, x);
- }
-
- // ssm
- {
- // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
- ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
- // split
- ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
- ggml_tensor * B =
- ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
- x_db->nb[2], ggml_element_size(x_db) * dt_rank);
- ggml_tensor * C =
- ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
- x_db->nb[2], ggml_element_size(x_db) * (dt_rank + d_state));
-
- // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
- if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
- dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
- B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
- C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
- }
-
- // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
- dt = build_lora_mm(layer.ssm_dt, dt);
- dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
-
- cur = x;
- x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
-
- ggml_tensor * A = layer.ssm_a;
-
- // use the states and the indices provided by build_recurrent_state
- // (this is necessary in order to properly use the states before they are overwritten,
- // while avoiding to make unnecessary copies of the states)
- auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
- ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
-
- // Custom operator to optimize the parallel associative scan
- // as described in the Annex D of the Mamba paper.
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
- return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
- };
-
- ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
-
- // store last states
- ggml_build_forward_expand(
- gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, x->nb[3] * x->ne[3]),
- ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
- kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
-
- ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
-
- // TODO: skip computing output earlier for unused tokens
-
- y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
- y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
-
- // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(layer.ssm_out, y);
- }
-
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
-
- return cur;
-}
-
-ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * inp,
- ggml_tensor * cur,
- const llama_model & model,
- const llama_ubatch & ubatch,
- int il) const {
- const auto * mctx_cur = inp->mctx;
-
- const auto kv_head = mctx_cur->get_head();
-
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t n_head = hparams.ssm_dt_rank;
- const int64_t head_dim = d_inner / n_head;
- const int64_t n_group = hparams.ssm_n_group;
- const int64_t n_seqs = ubatch.n_seqs;
-
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs());
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
- ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
- ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
-
- ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
- conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs);
-
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
- // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
-
- // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
- ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
-
- // split the above in three
- ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
- zxBCdt->nb[1], zxBCdt->nb[2], 0);
- ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2 * n_group * d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1],
- zxBCdt->nb[2], d_inner * ggml_element_size(zxBCdt));
- ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2],
- (2 * d_inner + 2 * n_group * d_state) * ggml_element_size(zxBCdt));
-
- // conv
- {
- // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
- ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
-
- // copy last (d_conv - 1) columns back into the state cache
- ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs,
- conv_x->nb[1], conv_x->nb[2], n_seq_tokens * (conv_x->nb[0]));
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
- ggml_view_1d(ctx0, conv_states_all,
- (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
- kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
- ggml_element_size(conv_states_all))));
-
- // 1D convolution
- // The equivalent is to make a self-overlapping view of conv_x
- // over d_conv columns at each stride in the 3rd dimension,
- // then element-wise multiply that with the conv1d weight,
- // then sum the elements of each row,
- // (the last two steps are a dot product over rows (also doable with mul_mat))
- // then permute away the ne[0] dimension,
- // and then you're left with the resulting x tensor.
- // For simultaneous sequences, all sequences need to have the same length.
- xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
-
- // bias
- xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
-
- xBC = ggml_silu(ctx0, xBC);
- }
-
- // ssm
- {
- // These correspond to V K Q in SSM/attention duality
- ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * xBC->nb[0],
- xBC->nb[1], xBC->nb[2], 0);
- ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
- xBC->nb[1], xBC->nb[2], d_inner * ggml_element_size(xBC));
- ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
- xBC->nb[1], xBC->nb[2], (d_inner + n_group * d_state) * ggml_element_size(xBC));
-
- // {n_head, n_seq_tokens, n_seqs}
- dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
-
- ggml_tensor * A = model.layers[il].ssm_a;
-
- // use the states and the indices provided by build_recurrent_state
- // (this is necessary in order to properly use the states before they are overwritten,
- // while avoiding to make unnecessary copies of the states)
- auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
- ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
-
- // TODO: use semistructured matrices to implement state-space duality
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
- return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
- };
-
- ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
-
- // store last states
- ggml_build_forward_expand(
- gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, ggml_nelements(x) * x->nb[0]),
- ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
- kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
-
- ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head * x->nb[1],
- n_seq_tokens * n_head * x->nb[1], 0);
-
- // TODO: skip computing output earlier for unused tokens
-
- y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
- cb(y, "mamba2_y_add_d", il);
- y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
-
- // grouped RMS norm
- if (model.layers[il].ssm_norm) {
- y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
- y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
- }
-
- y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
-
- // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(model.layers[il].ssm_out, y);
- }
-
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
- cb(cur, "mamba_out", il);
-
- return cur;
-}
--- /dev/null
+#include "models.h"
+
+// JAIS-2 model graph builder
+// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
+llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // KV input for attention
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // Pre-attention LayerNorm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // Self-attention with separate Q, K, V projections
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur_bias", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur_bias", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur_bias", il);
+
+ // Reshape for attention
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Apply RoPE
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur_rope", il);
+ cb(Kcur, "Kcur_rope", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // Residual connection
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // Pre-FFN LayerNorm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ // FFN with relu2 activation (ReLU squared) - no gate projection
+ // up -> relu2 -> down
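+ // relu2(x) = max(x, 0)^2 (LLM_FFN_RELU_SQR)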
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL, // no gate
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // Residual connection
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ inpL = build_cvec(inpL, il);
+ cb(inpL, "l_out", il);
+ }
+
+ // Final LayerNorm
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+ cb(cur, "result_norm", -1);
+
+ res->t_embd = cur;
+
+ // Output projection
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
#include "models.h"
-llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
+llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
ggml_tensor * cur;
#include "models.h"
#include "ggml.h"
-#define CHUNK_SIZE 64
+#include "llama-memory-recurrent.h"
// Causal Conv1d function for Q,K,V
// When qkv is 0, it is Q, 1 is K, 2 is V
}
llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params), model(model) {
+ llm_build_delta_net_base(params), model(model) {
ggml_tensor * cur;
ggml_tensor * inpL;
// Output ids for selecting which tokens to output
ggml_tensor * inp_out_ids = build_inp_out_ids();
- ggml_tensor * chunked_causal_mask =
- ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
- GGML_TRI_TYPE_LOWER);
-
- ggml_tensor * chunked_identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
- ggml_tensor * chunked_diag_mask = ggml_add(ctx0, chunked_causal_mask, chunked_identity);
-
- ggml_build_forward_expand(gf, chunked_causal_mask);
- ggml_build_forward_expand(gf, chunked_identity);
- ggml_build_forward_expand(gf, chunked_diag_mask);
-
// Kimi dimension constants
const int64_t n_head = hparams.n_head();
const int64_t head_dim = hparams.n_embd_head_kda;
cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
+ ggml_build_forward_expand(gf, cur);
+
// Check layer type by checking which tensors exist
// KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
bool is_kda = (layer.ssm_a != nullptr);
g1 = ggml_mul(ctx0, g1, A);
cb(g1, "kda_g1", il);
+ g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs);
+
// Compute beta (mixing coefficient)
ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur);
- beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs);
+ beta = ggml_reshape_4d(ctx0, beta, 1, n_head, n_seq_tokens, n_seqs);
cb(beta, "kda_beta", il);
+ beta = ggml_sigmoid(ctx0, beta);
+
// Reshape for KDA recurrence
// {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs}
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
- g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs);
-
// Get SSM state and compute KDA recurrence using ggml_kda_scan
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs);
- // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ Qcur = ggml_l2_norm(ctx0, Qcur, eps_norm);
+ Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm);
+
+ // Choose between build_delta_net_chunking and build_delta_net_autoregressive based on n_tokens
std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
- build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
- build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il);
+ build_delta_net_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
+ build_delta_net_chunking(Qcur, Kcur, Vcur, g1, beta, state, il);
- ggml_tensor * output = attn_out.first;
+ ggml_tensor * output = ggml_cont(ctx0, attn_out.first);
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(new_state, "new_state", il);
ggml_build_forward_expand(gf, cur);
}
-
-/*
- This is a ggml implementation of the naive_chunk_kda function of
- https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py
-*/
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * gk,
- ggml_tensor * beta,
- ggml_tensor * state,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
- int il) {
- GGML_ASSERT(ggml_is_contiguous(state));
-
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(v->ne[2] == n_tokens);
- GGML_ASSERT(k->ne[2] == n_tokens);
- GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs);
- GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
- GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
-
- GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
-
- // TODO: can this ever be false?
- const bool use_qk_l2norm = true;
-
- if (use_qk_l2norm) {
- const float eps_norm = hparams.f_norm_rms_eps;
-
- q = ggml_l2_norm(ctx0, q, eps_norm);
- k = ggml_l2_norm(ctx0, k, eps_norm);
- }
-
- const float scale = 1.0f / sqrtf(S_v);
-
- beta = ggml_sigmoid(ctx0, beta);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(beta, "beta_in", il);
- cb(gk, "gk_in", il);
-
- q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs);
- k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs);
- v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-
- beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
- state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
-
- cb(q, "q_perm", il);
- cb(k, "k_perm", il);
- cb(v, "v_perm", il);
- cb(beta, "beta_perm", il);
- cb(gk, "gk_perm", il);
- cb(state, "state_in", il);
-
- GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
- GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
- GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
-
- // Do padding
- const int64_t chunk_size = CHUNK_SIZE;
-
- const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
- const int64_t n_chunks = (n_tokens + pad) / chunk_size;
-
- q = ggml_pad(ctx0, q, 0, pad, 0, 0);
- k = ggml_pad(ctx0, k, 0, pad, 0, 0);
- v = ggml_pad(ctx0, v, 0, pad, 0, 0);
- gk = ggml_pad(ctx0, gk, 0, pad, 0, 0);
- beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
-
- cb(q, "q_pad", il);
- cb(k, "k_pad", il);
- cb(v, "v_pad", il);
- cb(beta, "beta_pad", il);
- cb(gk, "gk_pad", il);
-
- ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
- ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
-
- cb(v_beta, "v_beta", il);
- cb(k_beta, "k_beta", il);
-
- const int64_t HB = H_k * n_seqs;
-
- q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB);
- k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, HB);
- k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, HB);
- v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, HB);
- v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, HB);
-
- gk = ggml_cont_4d(ctx0, gk, S_k, chunk_size, n_chunks, HB);
- beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, HB);
-
- // switch for cumsum
- gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB);
- cb(gk, "gk", il);
- ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk);
- cb(gk_cumsum, "gk_cumsum", il);
-
-/*
- Compute Akk and Aqk loop together
- Akk loop:
- for i in range(BT):
- k_i = k[..., i, :] # k_i [B,H,NT,S]
- g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S]
- A[..., i] = torch.einsum('... c d, ... d -> ... c', k * (g - g_i).exp(), k_i)
- Aqk loop:
- for j in range(BT):
- k_j = k[:, :, i, j]
- g_j = g[:, :, i, j:j+1, :]
- A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j)
-*/
- const int64_t CHB = n_chunks * H_k * n_seqs;
- ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB]
- ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB]
-
- ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB]
- // decay_mask [chunk_size,chunk_size,S_k,CHB]
- ggml_tensor * decay_mask = ggml_sub(ctx0, gkcs_j_bc, gkcs_i);
- cb(decay_mask, "decay_mask", il);
-
- decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
- cb(decay_mask, "decay_masked", il);
- decay_mask = ggml_exp(ctx0, decay_mask);
- decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
-
- // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched
- decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB);
-
- ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB);
- ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB);
- ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB);
-
- ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i);
- ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i);
-
- // decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB]
- ggml_tensor * Akk = ggml_mul_mat(ctx0, decay_k_i, k_j);
- ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i, k_j);
- Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB)));
- Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB)));
- cb(Akk, "Akk", il);
- cb(Aqk, "Aqk", il);
-
- Akk = ggml_mul(ctx0, Akk, beta);
- Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask));
- cb(Akk, "attn_pre_solve", il);
-
- Aqk = ggml_mul(ctx0, Aqk, diag_mask);
- Aqk = ggml_scale(ctx0, Aqk, scale); // scale q
- cb(Aqk, "Aqk_masked", il);
-
- // for i in range(1, chunk_size):
- // row = attn[..., i, :i].clone()
- // sub = attn[..., :i, :i].clone()
- // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
- // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
- //
- // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
- ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask);
- ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
-
- ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false);
- Akk = ggml_mul(ctx0, lin_solve, causal_mask);
- Akk = ggml_add(ctx0, Akk, identity);
-
- cb(Akk, "attn_solved", il);
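// note: a sketch of the reduction used above. With L = tril(Akk) strictly
// lower-triangular, L is nilpotent, so (I - L) is invertible and
// (I - L)^{-1} = I + L + L^2 + ... is a finite sum; the reference loop
// accumulates exactly those terms. The single ggml_solve_tri call instead
// solves (I - L) X = Akk, and masking X back to the lower triangle and
// adding I yields the same matrix.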
-
- // switch back for downstream
- gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB);
- ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum);
- cb(gk_cumsum, "gk_cumsum", il);
-
- // u = (A*beta[..., None, :]) @ v aka U_[t]
- ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk);
-
- ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp);
- cb(kbeta_gkexp, "kbeta_gkexp", il);
-
- ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk);
- cb(k_cumdecay, "k_cumdecay", il);
-
- ggml_tensor * core_attn_out = nullptr;
- ggml_tensor * new_state = ggml_dup(ctx0, state);
-
- cb(new_state, "new_state", il);
-
- for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
-// extract one chunk worth of data
- auto chunkify = [=](ggml_tensor * t) {
- return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
- t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
- };
- auto chunkify_A = [=](ggml_tensor * t) {
- return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, chunk_size, 1, t->ne[3],
- t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
- };
-
-
-// k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B]
- ggml_tensor * k_chunk = chunkify(k);
- ggml_tensor * q_chunk = chunkify(q);
- ggml_tensor * vb_chunk = chunkify(vb);
-
-// gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B]
- ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum);
- ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
- ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk);
- ggml_tensor * Aqk_chunk = chunkify_A(Aqk);
-
- ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
-
- // new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B]
- // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t]
- ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
-
- // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t]
- ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime);
- ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
-
- // q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B]
- // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
- // or Gamma_[t]*Q_]t] @ S
- ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk);
- ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp);
- attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q
-
- // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B]
- // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t])
- ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk_chunk);
-
- // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i
- ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
-
- core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
-
- ggml_tensor * gk_cum_last =
- ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cs_chunk, gk_cs_chunk->ne[0], 1, gk_cs_chunk->ne[2], gk_cs_chunk->ne[3],
- gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3],
- gk_cs_chunk->nb[1] * (gk_cs_chunk->ne[1] - 1)));
-
- ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last)));
-
- ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cs_chunk, gk_cum_last));
-
- ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff);
-
- ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp);
-
- // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S)
- ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff)));
-
- new_state = ggml_add(ctx0,
- ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gkexp_last, gkexp_last->ne[0], gkexp_last->ne[1], H_v, n_seqs)),
- ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
- }
-
- core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
-
- // truncate padded tokens
- ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
- S_v, n_tokens, H_v, n_seqs,
- ggml_row_size(core_attn_out->type, S_v),
- ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
- ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
- output_tokens = ggml_cont(ctx0, output_tokens);
- // permute back to (S_v, H_v, n_tokens, n_seqs)
- output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
- output_tokens = ggml_cont(ctx0, output_tokens);
-
- cb(new_state, "output_state", il);
-
- return {output_tokens, new_state};
-}
-
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * gk,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il) {
- GGML_ASSERT(ggml_is_contiguous(v));
- GGML_ASSERT(ggml_is_contiguous(gk));
-
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(n_tokens == 1);
- GGML_ASSERT(v->ne[2] == n_tokens);
- GGML_ASSERT(k->ne[2] == n_tokens);
- GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_k && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs);
- GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
- GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_k && state->ne[2] == H_v && state->ne[3] == n_seqs);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
-
- GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
-
- const float eps_norm = hparams.f_norm_rms_eps;
-
- q = ggml_l2_norm(ctx0, q, eps_norm);
- k = ggml_l2_norm(ctx0, k, eps_norm);
-
- const float scale = 1.0f / sqrtf(S_v);
-
- q = ggml_scale(ctx0, q, scale);
- beta = ggml_sigmoid(ctx0, beta);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(beta, "beta_in", il);
- cb(gk, "gk_in", il);
-
-// g [H,1,B,1] g_t [1,H,B,1] => [1,1,H,B]
-// gk [S,H,1,B] => [S,1,H,B] gk_t [1,S,H,B]
-// beta [H,1,1,B] beta_t [1,H,1,B] => [1,1,H,B]
- gk = ggml_reshape_4d(ctx0, gk, S_k, 1, H_k, n_seqs);
- ggml_tensor * gk_t = ggml_cont(ctx0, ggml_transpose(ctx0, gk));
- ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
-
- // Apply exponential to gk_t
- gk_t = ggml_exp(ctx0, gk_t);
- // Apply the gated delta rule for the single timestep
- // last_recurrent_state = last_recurrent_state * gk_t
- // S = S * g_i[..., None].exp()
- state = ggml_mul(ctx0, state, gk_t);
-
- ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
-
-// state [S,S,H,B] k [S,1,H,B] k_state [S_v,1,H,B]
- k = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs);
- ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k);
-
- // v_i - (k_i[..., None] * S).sum(-2)
- v = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
- ggml_tensor * v_diff = ggml_sub(ctx0, v, k_state);
-
- // b_i[..., None] * k_i
- ggml_tensor * k_beta = ggml_mul(ctx0, k, beta_t);
-
- // S = S + torch.einsum('b h k, b h v -> b h k v', b_i[..., None] * k_i, v_i - (k_i[..., None] * S).sum(-2))
- // v_diff_t [1,S_v,H,B] k_beta_t [1,S_k,H,B] state [S_v,S_k,H,B]
- state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta))));
-
- q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs);
- state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
- ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q);
- // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
- cb(core_attn_out, "output_tokens", il);
- cb(state, "new_state", il);
-
- return {core_attn_out, state};
-}
-
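// For reference, a sketch of the single-step gated delta rule that the
// autoregressive path above implements (symbols follow the einsum comments;
// S is the per-head [S_k x S_v] state):
//
//   S_t  = S_{t-1} * exp(g_t)                    // per-key-dim decay
//   S_t += (beta_t * k_t) (v_t - S_t^T k_t)^T    // rank-1 delta update
//   o_t  = S_t^T q_t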
#include "models.h"
+#include "../llama-memory-hybrid-iswa.h"
#include "../llama-memory-hybrid.h"
-
-llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context(params),
- model(model) {
+template <bool iswa>
+llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ using inp_hybrid_type = std::conditional_t<iswa, llm_graph_input_mem_hybrid_iswa, llm_graph_input_mem_hybrid>;
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ using mem_hybrid_ctx = std::conditional_t<iswa, llama_memory_hybrid_iswa_context, llama_memory_hybrid_context>;
+
+ // lambda helpers for readability
+ auto build_dense_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
+ GGML_ASSERT(!model.layers[il].ffn_up_b);
+ GGML_ASSERT(!model.layers[il].ffn_gate_b);
+ GGML_ASSERT(!model.layers[il].ffn_down_b);
+ return build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ };
+ auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
+ return build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+ };
+ auto build_attn_block = [&model, this](ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ inp_attn_type * inp_attn,
+ int il) -> ggml_tensor * {
+ GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
+ const auto n_embd_head = hparams.n_embd_head_v;
+ const auto n_head_kv = hparams.n_head_kv(il);
+
+ auto * q = build_lora_mm(model.layers[il].wq, cur);
+ cb(q, "model.layers.{}.self_attn.q_proj", il);
+ auto * k = build_lora_mm(model.layers[il].wk, cur);
+ cb(k, "model.layers.{}.self_attn.k_proj", il);
+ auto * v = build_lora_mm(model.layers[il].wv, cur);
+ cb(v, "model.layers.{}.self_attn.v_proj", il);
+
+ q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
+ k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+ v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+ // qk norm
+ q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(q, "model.layers.{}.self_attn.q_layernorm", il);
+ k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(k, "model.layers.{}.self_attn.k_layernorm", il);
+
+ // RoPE
+ q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+ attn_factor, beta_fast, beta_slow);
+ k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+ attn_factor, beta_fast, beta_slow);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+
+ cb(cur, "model.layers.{}.self_attn.out_proj", il);
+
+ return cur;
+ };
+ auto build_shortconv_block = [&model, this](ggml_tensor * cur,
+ llm_graph_input_rs * inp_recr,
+ int il) -> ggml_tensor * {
+ const auto * mctx_cur = static_cast<const mem_hybrid_ctx *>(mctx)->get_recr();
+ const uint32_t kv_head = mctx_cur->get_head();
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
+ const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
+ cb(bcx, "model.layers.{}.conv.in_proj", il);
+
+ constexpr auto n_chunks = 3;
+ GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
+ const auto chunk_size = bcx->ne[0] / n_chunks;
+ auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 0 * chunk_size * ggml_element_size(bcx));
+ auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 1 * chunk_size * ggml_element_size(bcx));
+ auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 2 * chunk_size * ggml_element_size(bcx));
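+        // in_proj output layout along ne[0]: three equal chunks [ b | c | x ],
+        // each of size chunk_size = ne[0] / 3, extracted by the views above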
+
+ auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
+
+ // read conv state
+ auto * conv_state = mctx_cur->get_r_l(il);
+ auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
+ auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
+
+ bx = ggml_concat(ctx0, conv, bx, 0);
+ GGML_ASSERT(bx->ne[0] > conv->ne[0]);
+
+        // the last d_conv columns are the new conv state
+ auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
+ (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
+ GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
+
+        // write the new conv state back to the cache
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
+ ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
+ kv_head * d_conv * n_embd * ggml_element_size(new_conv))));
+
+ auto * conv_kernel = model.layers[il].shortconv.conv;
+ auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
+ cb(conv_out, "model.layers.{}.conv.conv", il);
+
+ auto * y = ggml_mul(ctx0, c, conv_out);
+ y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
+ cb(y, "model.layers.{}.conv.out_proj", il);
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
+
+ return y;
+ };
+
+ // actual graph construction starts here
ggml_tensor * cur = build_inp_embd(model.tok_embd);
cb(cur, "model.embed_tokens", -1);
ggml_build_forward_expand(gf, cur);
+ inp_hybrid_type * inp_hybrid = nullptr;
+ if constexpr (iswa) {
+ inp_hybrid = build_inp_mem_hybrid_iswa();
+ } else {
+ inp_hybrid = build_inp_mem_hybrid();
+ }
+
ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_hybrid = build_inp_mem_hybrid();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_build_forward_expand(gf, cur);
}
-ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const {
- return build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
-}
-
-ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const {
- GGML_ASSERT(!model.layers[il].ffn_up_b);
- GGML_ASSERT(!model.layers[il].ffn_gate_b);
- GGML_ASSERT(!model.layers[il].ffn_down_b);
- return build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
-}
-
-ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv * inp_attn,
- int il) const {
- GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
- const auto n_embd_head = hparams.n_embd_head_v;
- const auto n_head_kv = hparams.n_head_kv(il);
-
- auto * q = build_lora_mm(model.layers[il].wq, cur);
- cb(q, "model.layers.{}.self_attn.q_proj", il);
- auto * k = build_lora_mm(model.layers[il].wk, cur);
- cb(k, "model.layers.{}.self_attn.k_proj", il);
- auto * v = build_lora_mm(model.layers[il].wv, cur);
- cb(v, "model.layers.{}.self_attn.v_proj", il);
-
- q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
- k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
- v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
-
- // qk norm
- q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
- cb(q, "model.layers.{}.self_attn.q_layernorm", il);
- k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
- cb(k, "model.layers.{}.self_attn.k_layernorm", il);
-
- // RoPE
- q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
- attn_factor, beta_fast, beta_slow);
- k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
- attn_factor, beta_fast, beta_slow);
-
- cur = build_attn(inp_attn,
- model.layers[il].wo, NULL,
- q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
-
- cb(cur, "model.layers.{}.self_attn.out_proj", il);
-
- return cur;
-}
-
-ggml_tensor * llm_build_lfm2::build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) {
- const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
- const uint32_t kv_head = mctx_cur->get_head();
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- const int64_t n_seqs = ubatch.n_seqs;
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs());
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
-
- GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
- const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
-
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
-
- auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
- cb(bcx, "model.layers.{}.conv.in_proj", il);
-
- constexpr auto n_chunks = 3;
- GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
- const auto chunk_size = bcx->ne[0] / n_chunks;
- auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
- 0 * chunk_size * ggml_element_size(bcx));
- auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
- 1 * chunk_size * ggml_element_size(bcx));
- auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
- 2 * chunk_size * ggml_element_size(bcx));
-
- auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
-
- // read conv state
- auto * conv_state = mctx_cur->get_r_l(il);
- auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
- auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
-
- bx = ggml_concat(ctx0, conv, bx, 0);
- GGML_ASSERT(bx->ne[0] > conv->ne[0]);
-
- // last d_conv columns is a new conv state
- auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
- (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
- GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
-
- // write new conv conv state
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
- ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
- kv_head * d_conv * n_embd * ggml_element_size(new_conv))));
-
- auto * conv_kernel = model.layers[il].shortconv.conv;
- auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
- cb(conv_out, "model.layers.{}.conv.conv", il);
-
- auto * y = ggml_mul(ctx0, c, conv_out);
- y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
- cb(y, "model.layers.{}.conv.out_proj", il);
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
-
- return y;
-}
+// Explicit template instantiations
+template struct llm_build_lfm2<true>;
+template struct llm_build_lfm2<false>;
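// A minimal, self-contained sketch of the compile-time dispatch pattern used
// by llm_build_lfm2 above (type and function names here are illustrative,
// not from llama.cpp):
//
//   #include <type_traits>
//
//   struct plain_ctx {};
//   struct iswa_ctx  {};
//
//   template <bool iswa>
//   struct builder {
//       // pick the context type at compile time, as done with
//       // llm_graph_input_mem_hybrid_iswa vs llm_graph_input_mem_hybrid
//       using ctx_t = std::conditional_t<iswa, iswa_ctx, plain_ctx>;
//
//       ctx_t make() {
//           if constexpr (iswa) {
//               return iswa_ctx{};   // SWA-enabled path
//           } else {
//               return plain_ctx{};  // regular path
//           }
//       }
//   };
//
//   // both variants are instantiated explicitly, mirroring the lines above
//   template struct builder<true>;
//   template struct builder<false>;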
--- /dev/null
+#include "models.h"
+
+#include "llama-memory-recurrent.h"
+
+llm_build_mamba_base::llm_build_mamba_base(const llm_graph_params & params) : llm_graph_context(params) {}
+
+ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) {
+ const auto * mctx_cur = inp->mctx;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t dt_rank = hparams.ssm_dt_rank;
+ const int64_t n_head = d_inner;
+ const int64_t head_dim = 1;
+ const int64_t n_seqs = ubatch.n_seqs;
+    // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on the B and Dt layers
+ const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+ // split the above in two
+ // => {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
+ ggml_tensor * z =
+ ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner * ggml_element_size(xz));
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
+ n_seq_tokens * (conv_x->nb[0]));
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all, (d_conv - 1) * (d_inner) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner) *ggml_element_size(conv_states_all))));
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
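+        // In formula form (a sketch; w is the d_conv-tap kernel of ssm_conv1d,
+        // applied depthwise per inner channel c):
+        //
+        //   x[t, c] = sum_{j=0..d_conv-1} w[j, c] * conv_x[t + j, c]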
+ x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
+
+ // bias
+ x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
+
+ x = ggml_silu(ctx0, x);
+ }
+
+ // ssm
+ {
+ // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+ ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
+ // split
+ ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
+ ggml_tensor * B =
+ ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+ x_db->nb[2], ggml_element_size(x_db) * dt_rank);
+ ggml_tensor * C =
+ ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+ x_db->nb[2], ggml_element_size(x_db) * (dt_rank + d_state));
+
+ // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
+ if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
+ dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
+ B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
+ C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
+ }
+
+ // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+ dt = build_lora_mm(layer.ssm_dt, dt);
+ dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
+
+ cur = x;
+ x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
+
+ ggml_tensor * A = layer.ssm_a;
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+        // while avoiding making unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // Custom operator to optimize the parallel associative scan
+            // as described in Annex D of the Mamba paper.
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, x->nb[3] * x->ne[3]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+ kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+
+ ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(layer.ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+
+ return cur;
+}
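+
+// note: per head and state dimension, ggml_ssm_scan above evaluates the
+// discretized selective-scan recurrence from the Mamba paper (a sketch; the
+// exact dt activation is applied inside the operator):
+//
+//   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
+//   y_t = dot(C_t, h_t)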
+
+ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = inp->mctx;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_head = hparams.ssm_dt_rank;
+ const int64_t head_dim = d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
+
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+ // split the above in three
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
+ zxBCdt->nb[1], zxBCdt->nb[2], 0);
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2 * n_group * d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1],
+ zxBCdt->nb[2], d_inner * ggml_element_size(zxBCdt));
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2],
+ (2 * d_inner + 2 * n_group * d_state) * ggml_element_size(zxBCdt));
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs,
+ conv_x->nb[1], conv_x->nb[2], n_seq_tokens * (conv_x->nb[0]));
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all,
+ (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
+ ggml_element_size(conv_states_all))));
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+ // bias
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+ xBC = ggml_silu(ctx0, xBC);
+ }
+
+ // ssm
+ {
+ // These correspond to V K Q in SSM/attention duality
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], 0);
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], d_inner * ggml_element_size(xBC));
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], (d_inner + n_group * d_state) * ggml_element_size(xBC));
+
+ // {n_head, n_seq_tokens, n_seqs}
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+ ggml_tensor * A = model.layers[il].ssm_a;
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+        // while avoiding making unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // TODO: use semistructured matrices to implement state-space duality
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, ggml_nelements(x) * x->nb[0]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+ kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head * x->nb[1],
+ n_seq_tokens * n_head * x->nb[1], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ cb(y, "mamba2_y_add_d", il);
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+ // grouped RMS norm
+ if (model.layers[il].ssm_norm) {
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+ }
+
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ cb(cur, "mamba_out", il);
+
+ return cur;
+}
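+
+// layout note for the fused ssm_in projection split in build_mamba2_layer
+// (offsets in elements along ne[0], matching the d_in_proj comment):
+//
+//   [ z : d_inner | xBC : d_inner + 2*n_group*d_state | dt : n_head ]
+//
+// z starts at 0, xBC at d_inner, dt at 2*d_inner + 2*n_group*d_state, for a
+// total of d_in_proj = 2*d_inner + 2*n_group*d_state + n_head.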
#include "models.h"
-
-llm_build_mamba::llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
+llm_build_mamba::llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
#pragma once
-#include "../llama-model.h"
-#include "../llama-graph.h"
+#include "llama-model.h"
+#include "llama-graph.h"
-// TODO: remove in follow-up PR - move to .cpp files
-#include "../llama-memory-recurrent.h"
+// note: almost all graphs require at least sqrtf, so include <cmath> globally
#include <cmath>
-struct llm_graph_context_mamba : public llm_graph_context {
- llm_graph_context_mamba(const llm_graph_params & params);
+//
+// base classes
+//
- virtual ~llm_graph_context_mamba() = default;
+struct llm_build_mamba_base : public llm_graph_context {
+ llm_build_mamba_base(const llm_graph_params & params);
+
+ virtual ~llm_build_mamba_base() = default;
ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const;
};
-// Base class for RWKV-related models
+struct llm_build_delta_net_base : public llm_graph_context {
+ llm_build_delta_net_base(const llm_graph_params & params);
+
+ virtual ~llm_build_delta_net_base() = default;
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il);
+};
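+// usage sketch (hypothetical dispatch; the autoregressive path expects a
+// single token per step):
+//
+//   auto [out, new_state] = (n_tokens == 1)
+//       ? build_delta_net_autoregressive(q, k, v, g, b, s, il)
+//       : build_delta_net_chunking(q, k, v, g, b, s, il);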
+
struct llm_build_rwkv6_base : public llm_graph_context {
const llama_model & model;
int il) const;
};
+//
+// models
+//
+
struct llm_build_afmoe : public llm_graph_context {
llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
};
llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_paddleocr : public llm_graph_context {
+ llm_build_paddleocr(const llama_model & model, const llm_graph_params & params);
+};
+
template <bool iswa>
struct llm_build_exaone4 : public llm_graph_context {
llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
llm_build_falcon(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_falcon_h1 : public llm_graph_context_mamba {
+struct llm_build_falcon_h1 : public llm_build_mamba_base {
llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params);
};
const int il);
};
-struct llm_build_granite_hybrid : public llm_graph_context_mamba {
+struct llm_build_granite_hybrid : public llm_build_mamba_base {
llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params);
ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
llm_build_jais(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_jamba : public llm_graph_context_mamba {
+struct llm_build_jais2 : public llm_graph_context {
+ llm_build_jais2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_jamba : public llm_build_mamba_base {
llm_build_jamba(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_kimi_linear : public llm_graph_context_mamba {
+struct llm_build_kimi_linear : public llm_build_delta_net_base {
llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params);
std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
const llama_model & model;
};
+template <bool iswa>
struct llm_build_lfm2 : public llm_graph_context {
- const llama_model & model;
-
llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
- ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const;
- ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const;
- ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const;
- ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il);
-
};
struct llm_build_llada : public llm_graph_context {
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_mamba : public llm_graph_context_mamba {
+struct llm_build_mamba : public llm_build_mamba_base {
llm_build_mamba(const llama_model & model, const llm_graph_params & params);
};
llm_build_nemotron(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_nemotron_h : public llm_graph_context_mamba {
+struct llm_build_nemotron_h : public llm_build_mamba_base {
llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params);
- ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il);
+ ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il);
ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
- const llama_model & model, const int64_t n_embd_head, const int il);
+ const llama_model & model, int64_t n_embd_head, int il);
};
struct llm_build_neo_bert : public llm_graph_context {
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_eurobert : public llm_graph_context {
+ llm_build_eurobert(const llama_model & model, const llm_graph_params & params);
+};
+
template <bool iswa>
struct llm_build_olmo2 : public llm_graph_context {
llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
llm_build_phi3(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_plamo2 : public llm_graph_context_mamba {
+struct llm_build_plamo2 : public llm_build_mamba_base {
llm_build_plamo2(const llama_model & model, const llm_graph_params & params);
private:
ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
};
-struct llm_build_qwen3next : public llm_graph_context_mamba {
+struct llm_build_qwen3next : public llm_build_delta_net_base {
llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
private:
ggml_tensor * build_layer_attn(
ggml_tensor * cur,
int il);
- // returns pair of output and new state
- std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il);
-
- // returns pair of output and new state
- std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il);
-
ggml_tensor * build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,
const llama_model & model;
};
-struct llm_build_qwen35 : public llm_graph_context_mamba {
+struct llm_build_qwen35 : public llm_build_delta_net_base {
llm_build_qwen35(const llama_model & model, const llm_graph_params & params);
private:
ggml_tensor * build_layer_attn(
ggml_tensor * build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
int il);
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
int il);
- // returns pair of output and new state
- std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
- int il);
-
- // returns pair of output and new state
- std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il);
-
ggml_tensor * build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,
const llama_model & model;
};
-struct llm_build_qwen35moe : public llm_graph_context_mamba {
+struct llm_build_qwen35moe : public llm_build_delta_net_base {
llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params);
private:
ggml_tensor * build_layer_attn(
ggml_tensor * build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
int il);
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
int il);
- // returns pair of output and new state
- std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
- int il);
-
- // returns pair of output and new state
- std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il);
-
ggml_tensor * build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,
LLM_NORM, -1);
cb(cur, "final_norm_out", -1);
- if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
- // extracting cls token
- cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
- cb(cur, "cls_pooled_embd", -1);
- }
-
- cb(cur, "res_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}
#include "models.h"
-
-
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params) {
+ llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor * cur,
llm_graph_input_attn_kv * inp_attn,
const llama_model & model,
- const int64_t n_embd_head,
- const int il) {
+ int64_t n_embd_head,
+ int il) {
// compute Q and K
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
return cur;
}
-ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
+ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
--- /dev/null
+#include "models.h"
+
+llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+
+    // NOTE: same as qwen2vl.cpp, but the bias tensors are optional
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
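+
+// note: the repeated optional-bias pattern above could be folded into a small
+// helper (a sketch, not part of the tree; ggml_add is applied only when the
+// bias tensor exists):
+//
+//   auto add_bias = [&](ggml_tensor * t, ggml_tensor * b) {
+//       return b ? ggml_add(ctx0, t, b) : t;
+//   };
+//   Qcur = add_bias(build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);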
#include "models.h"
+#include "llama-memory-recurrent.h"
+
llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params) {
+ llm_build_mamba_base(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
-#include "ggml.h"
#include "models.h"
-#define CHUNK_SIZE 64
+#include "llama-memory-recurrent.h"
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params), model(model) {
+ llm_build_delta_net_base(params), model(model) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
- ggml_tensor * causal_mask =
- ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
- GGML_TRI_TYPE_LOWER);
-
- ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
- ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
-
- ggml_build_forward_expand(gf, causal_mask);
- ggml_build_forward_expand(gf, identity);
- ggml_build_forward_expand(gf, diag_mask);
-
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
+ ggml_build_forward_expand(gf, cur);
+
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
- cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
+ cur = build_layer_attn_linear(inp->get_recr(), cur, il);
} else {
// Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
ggml_build_forward_expand(gf, cur);
}
-// utility to get one slice from the third dimension
-// input dim: [x, y, c, b]
-// output dim: [x, y, 1, b]
-static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
- return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
- t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
-}
-
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_delta_net_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
- int il) {
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(v->ne[2] == n_tokens);
- GGML_ASSERT(k->ne[2] == n_tokens);
- GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
- GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
- GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
-
- GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
-
- const float eps_norm = hparams.f_norm_rms_eps;
-
- q = ggml_l2_norm(ctx0, q, eps_norm);
- k = ggml_l2_norm(ctx0, k, eps_norm);
-
- const float scale = 1.0f / sqrtf(S_v);
-
- q = ggml_scale(ctx0, q, scale);
-
- beta = ggml_sigmoid(ctx0, beta);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(beta, "beta_in", il);
- cb(g, "g_in", il);
-
- q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
-
- beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
- state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
-
- cb(q, "q_perm", il);
- cb(k, "k_perm", il);
- cb(v, "v_perm", il);
- cb(beta, "beta_perm", il);
- cb(g, "g_perm", il);
- cb(state, "state_in", il);
-
- GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
- GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
- GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
-
- // Do padding
- const int64_t chunk_size = CHUNK_SIZE;
-
- const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
- const int64_t n_chunks = (n_tokens + pad) / chunk_size;
-
- q = ggml_pad(ctx0, q, 0, pad, 0, 0);
- k = ggml_pad(ctx0, k, 0, pad, 0, 0);
- v = ggml_pad(ctx0, v, 0, pad, 0, 0);
- g = ggml_pad(ctx0, g, pad, 0, 0, 0);
- beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
-
- cb(q, "q_pad", il);
- cb(k, "k_pad", il);
- cb(v, "v_pad", il);
- cb(beta, "beta_pad", il);
- cb(g, "g_pad", il);
-
- ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
- ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
-
- cb(v_beta, "v_beta", il);
- cb(k_beta, "k_beta", il);
-
- q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
- k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
- k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
- v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
- v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
-
- g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
- beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
-
- ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
- cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
- ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
-
- ggml_tensor * gcs_j_broadcast =
- ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
-
- ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
- cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
- decay_mask = ggml_exp(ctx0, decay_mask);
- decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
-
- ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
-
- ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
- ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
- cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
- ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
-
- ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
- attn = ggml_mul(ctx0, lin_solve, causal_mask);
- attn = ggml_add(ctx0, attn, identity);
- cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
-
- ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
- ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
-
- ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
- cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * k_cumdecay =
- ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
- cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
- attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
- attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
- cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
-
- // vectorized calculation of key_gdiff
- // improved from the chunked version:
- // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
- // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
- // key_gdiff = key * g_diff.unsqueeze(-1)
- // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
- // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
-
- // get last element in g_cumsum along chunk_size dimension (ne0)
- // example: [[x, y, z, ..., last], ...] -> [[last], ...]
- ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
- g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
- (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
- g_last = ggml_cont(ctx0, g_last);
- cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
- cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
- cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
- ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
- 1, chunk_size, n_chunks, g_diff_exp->ne[3]);
-
- ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
- cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
- cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
-
- // state to be updated per chunk
- ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
- cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
-
- // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
- ggml_tensor * core_attn_out = nullptr;
-
- for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
- // shape: (S_k, chunk_size, 1, H_k * n_seqs)
- ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
-
- // shape: (S_v, chunk_size, 1, H_v * n_seqs)
- ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
-
- // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
- ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
-
- // shape: (chunk_size, 1, H_v * n_seqs)
- ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
-
- // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
- // replaced by precomputed attn_kq
- ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
- cb(attn_chunk, "attn_chunk", il);
-
- ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
-
- // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
- ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
- cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
-
- // v_new = v_i - v_prime
- ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
- ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
- cb(v_new, "v_new_chunk", il);
-
- // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
- ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
- ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
- cb(attn_inter, "attn_inter_chunk", il);
-
- // core_attn_out[:, :, i] = attn_inter + attn @ v_new
- ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
- cb(v_attn, "v_attn_chunk", il);
-
- ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
- cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
-
- core_attn_out = core_attn_out == nullptr
- ? core_attn_out_chunk
- : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
-
- // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
- ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
- //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
- ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
-
- // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
- ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
- new_state = ggml_add(ctx0,
- ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
- ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
- }
-
- // truncate padded tokens
- ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
- S_v, n_tokens, H_v, n_seqs,
- ggml_row_size(core_attn_out->type, S_v),
- ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
- ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
- output_tokens = ggml_cont(ctx0, output_tokens);
- cb(output_tokens, "output_tokens", il);
-
- // permute back to (S_v, H_v, n_tokens, n_seqs)
- output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
- output_tokens = ggml_cont(ctx0, output_tokens);
-
- return {output_tokens, new_state};
-}
-
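Note for reviewers following the chunked path being removed here (it moves into a shared base class, per the `llm_build_delta_net_base` constructor changes below): the per-chunk decay mask is the piece most worth checking against a scalar reference. A minimal, runnable sketch of the mask construction, ignoring heads and batches; names and sizes here are illustrative, not the ggml API:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // Scalar reference for the per-chunk decay mask built above:
    // decay_mask[j][i] = exp(g_cum[j] - g_cum[i]) for i <= j (lower triangle
    // incl. diagonal), 0 elsewhere -- mirroring the sub -> mask -> exp -> mask
    // sequence on ggml tensors.
    const int CS = 4; // toy chunk size
    std::vector<float> g = {-0.1f, -0.3f, -0.2f, -0.4f};

    // cumulative sum of the log-decay g over the chunk
    std::vector<float> g_cum(CS);
    float acc = 0.0f;
    for (int t = 0; t < CS; ++t) { acc += g[t]; g_cum[t] = acc; }

    // lower-triangular decay mask
    for (int j = 0; j < CS; ++j) {
        for (int i = 0; i < CS; ++i) {
            const float m = (i <= j) ? std::exp(g_cum[j] - g_cum[i]) : 0.0f;
            std::printf("%8.4f%c", m, i + 1 == CS ? '\n' : ' ');
        }
    }
    return 0;
}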
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_delta_net_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il) {
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
- GGML_ASSERT(v->ne[2] == n_tokens);
- GGML_ASSERT(k->ne[2] == n_tokens);
- GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
- GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
- GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
-
- GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
-
- const float eps_norm = hparams.f_norm_rms_eps;
-
- q = ggml_l2_norm(ctx0, q, eps_norm);
- k = ggml_l2_norm(ctx0, k, eps_norm);
-
- const float scale = 1.0f / sqrtf(S_v);
-
- q = ggml_scale(ctx0, q, scale);
- beta = ggml_sigmoid(ctx0, beta);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(beta, "beta_in", il);
- cb(g, "g_in", il);
-
- state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
-
- ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
- ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
-
- // Apply exponential to g_t
- g_t = ggml_exp(ctx0, g_t);
-
- // Apply the gated delta rule for the single timestep
- // last_recurrent_state = last_recurrent_state * g_t
- state = ggml_mul(ctx0, state, g_t);
-
- // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
- ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
- ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
- // we need to sum over dim=-2, so we transpose, sum, then transpose again
- kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
-
- // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
- ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
- // delta = (v_t - kv_mem) * beta_t
- ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
- ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
-
- // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
- ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
- state = ggml_add(ctx0, state, k_t_delta);
-
- // Compute the attention output
- // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
- ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
- ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
- // again, since it's over dim = -2, transpose, sum, transpose back
- ggml_tensor * core_attn_out =
- ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
-
- // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
- cb(core_attn_out, "output_tokens", il);
- cb(state, "new_state", il);
-
- return {core_attn_out, state};
-}
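The removed autoregressive path reduces to one gated delta-rule step per head. A scalar reference of that step, assuming the state S is a d_k x d_v matrix and that the l2-norm and 1/sqrt(d) scaling of q/k have already been applied, as they are in the function above (layout is illustrative only):

#include <vector>

// One gated delta-rule step for a single head (reference sketch).
//   S     <- S * exp(g)
//   delta  = beta * (v - S^T k)
//   S     <- S + k delta^T      (rank-1 update)
//   o      = S^T q
void delta_rule_step(std::vector<float> & S,        // d_k * d_v, row-major [dk][dv]
                     const std::vector<float> & q,  // d_k
                     const std::vector<float> & k,  // d_k
                     const std::vector<float> & v,  // d_v
                     float g_exp, float beta,
                     std::vector<float> & o,        // d_v (output)
                     int d_k, int d_v) {
    for (float & s : S) s *= g_exp;                 // state decay

    for (int j = 0; j < d_v; ++j) {
        float kv = 0.0f;                            // kv_mem = (S^T k)_j
        for (int i = 0; i < d_k; ++i) kv += S[i*d_v + j] * k[i];
        const float delta = beta * (v[j] - kv);     // delta = beta * (v - kv_mem)
        for (int i = 0; i < d_k; ++i) S[i*d_v + j] += k[i] * delta;
    }

    for (int j = 0; j < d_v; ++j) {                 // o = S^T q
        float acc = 0.0f;
        for (int i = 0; i < d_k; ++i) acc += S[i*d_v + j] * q[i];
        o[j] = acc;
    }
}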
-
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz(
ggml_tensor * input,
int il) {
ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
int il) {
const auto * mctx_cur = inp->mctx;
ggml_tensor * z = qkvz.second;
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
- beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
+ beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
cb(beta, "beta", il);
+
+ beta = ggml_sigmoid(ctx0, beta);
+
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
cb(alpha, "alpha", il);
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
+
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il);
+ gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
+
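The gate assembled here is the log-space forgetting factor of the delta net: gate = -exp(A_log) * softplus(alpha + dt_bias), with `ssm_a` holding -exp(A_log), so exp(gate) always lands in (0, 1). A quick numeric check with illustrative values:

#include <cmath>
#include <cstdio>

int main() {
    const float A_log = 0.5f;  // per-head learned log-decay magnitude (illustrative)
    const float alpha = 0.2f;  // ssm_alpha projection output for one head/token
    const float dt    = -1.0f; // ssm_dt bias

    const float softplus = std::log1p(std::exp(alpha + dt));
    const float gate     = -std::exp(A_log) * softplus; // always <= 0

    // exp(gate) is the multiplicative state decay applied inside the delta net
    std::printf("gate = %f, exp(gate) = %f\n", gate, std::exp(gate));
    return 0;
}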
// Get convolution states from cache
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
- // bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
-
// Build the convolution states tensor
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il);
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0];
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
- conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
+
+ conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il);
- qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
- cb(qkv_mixed, "qkv_mixed_permuted", il);
+ qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
+ cb(qkv_mixed, "qkv_mixed_transposed", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il);
cb(state_update_target, "state_update_target", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
- cb(conv_states_all, "conv_states_updated", il);
- // Apply SSM convolution
+ ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
+ cb(state, "state_predelta", il);
+
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il);
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output
- ggml_tensor * q_conv =
- ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
+ ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
+ ggml_row_size(conv_qkv_mix->type, head_k_dim),
+ nb1_qkv,
+ nb1_qkv * n_seq_tokens,
+ 0);
+
+ ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
+ ggml_row_size(conv_qkv_mix->type, head_k_dim),
+ nb1_qkv,
+ nb1_qkv * n_seq_tokens,
+ head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+
+ ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
+ ggml_row_size(conv_qkv_mix->type, head_v_dim),
+ nb1_qkv,
+ nb1_qkv * n_seq_tokens,
+ ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));
+
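These three 4-D views carve Q, K, V out of the fused conv output without a copy: each row of conv_qkv_mix is laid out [q | k | v], so all three views share the token stride nb1_qkv and differ only in their base byte offset. A small sketch of the offset arithmetic, assuming f32 elements and illustrative head counts:

#include <cstdio>

int main() {
    // illustrative dims matching the view calls above
    const int head_k_dim = 128, num_k_heads = 16;
    const int head_v_dim = 128, num_v_heads = 32;
    const int esize      = 4; // sizeof(float)

    const int qkv_dim = 2*head_k_dim*num_k_heads + head_v_dim*num_v_heads;
    const int nb1     = qkv_dim * esize; // byte stride between tokens (rows)

    // base offsets of the q/k/v sub-blocks inside one fused row
    const int off_q = 0;
    const int off_k = head_k_dim * num_k_heads * esize;
    const int off_v = 2 * head_k_dim * num_k_heads * esize;

    std::printf("row stride %d bytes; q at +%d, k at +%d, v at +%d\n",
                nb1, off_q, off_k, off_v);
    return 0;
}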
cb(q_conv, "q_conv", il);
- ggml_tensor * k_conv =
- ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
- head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il);
- ggml_tensor * v_conv =
- ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
- 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il);
- // Unsqueeze them
- q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
- k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
- v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+ const float eps_norm = hparams.f_norm_rms_eps;
- ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
- state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
- cb(state, "state_predelta", il);
+ q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
+ k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);
- // if head keys and value keys are different, repeat Q/K to match V's head count
- // V heads are in tiled order (from conversion), so simple tiled repeat works
+ //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+
+ // if the K and V head counts differ, repeat Q/K so they match V's head count
if (num_k_heads != num_v_heads) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
+ // TODO: try to avoid these explicit repeats by utilizing op broadcast
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
+ attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
}
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
// Update the recurrent states
ggml_build_forward_expand(gf,
- ggml_cpy(ctx0, new_state,
- ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
- kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
-
- // Reshape both attn_out_final and z to 2D tensors for normalization
- // attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
- ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+ ggml_cpy(ctx0, new_state,
+ ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+ kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
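The writeback above copies each sequence's updated per-head state into the flat recurrent cache slab at the slot starting at kv_head; the 1-D view is plain offset arithmetic. An illustrative sketch of that arithmetic (sizes made up for the example):

#include <cstdint>
#include <cstdio>

int main() {
    // illustrative cache-slot arithmetic for the ggml_view_1d above
    const int64_t n_embd_s = 128 * 128 * 32; // per-seq state size, e.g. head_v_dim^2 * num_v_heads
    const int64_t kv_head  = 3;              // first cache slot used by this ubatch
    const int64_t n_seqs   = 2;
    const int64_t esize    = 4;              // f32

    const int64_t off_bytes = kv_head * n_embd_s * esize; // view offset into ssm_states_all
    std::printf("copy %lld elems at byte offset %lld\n",
                (long long)(n_embd_s * n_seqs), (long long)off_bytes);
    return 0;
}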
- // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
- ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+ // z: keep 4-D as [head_dim, n_heads, n_tokens, n_seqs] so it broadcasts against the attention output
+ ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z)
- ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
+ ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * head_dim, n_tokens, n_seqs]
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
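build_norm_gated applies the gated RMS norm used by the delta-net reference implementations. Assuming the standard form (RMS-normalize the attention output per head, then gate elementwise with SiLU(z)), a scalar sketch looks like:

#include <cmath>
#include <vector>

// Gated RMS norm over one head vector (sketch; assumes RMSNorm + SiLU(z) gating).
void norm_gated(std::vector<float> & x, const std::vector<float> & z,
                const std::vector<float> & w, float eps) {
    float ss = 0.0f;
    for (float v : x) ss += v * v;
    const float inv_rms = 1.0f / std::sqrt(ss / x.size() + eps);
    for (size_t i = 0; i < x.size(); ++i) {
        const float silu = z[i] / (1.0f + std::exp(-z[i])); // z * sigmoid(z)
        x[i] = x[i] * inv_rms * w[i] * silu;
    }
}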
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
- cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+
return cur;
}
-#include "ggml.h"
#include "models.h"
-#define CHUNK_SIZE 64
+#include "llama-memory-recurrent.h"
llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params), model(model) {
+ llm_build_delta_net_base(params), model(model) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
- ggml_tensor * causal_mask =
- ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
- GGML_TRI_TYPE_LOWER);
-
- ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
- ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
-
- ggml_build_forward_expand(gf, causal_mask);
- ggml_build_forward_expand(gf, identity);
- ggml_build_forward_expand(gf, diag_mask);
-
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
+ ggml_build_forward_expand(gf, cur);
+
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
- cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
+ cur = build_layer_attn_linear(inp->get_recr(), cur, il);
} else {
// Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
ggml_build_forward_expand(gf, cur);
}
-// utility to get one slice from the third dimension
-// input dim: [x, y, c, b]
-// output dim: [x, y, 1, b]
-static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
- return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
- t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
-}
-
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_delta_net_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
- int il) {
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(v->ne[2] == n_tokens);
- GGML_ASSERT(k->ne[2] == n_tokens);
- GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
- GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
- GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
-
- GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
-
- const float eps_norm = hparams.f_norm_rms_eps;
-
- q = ggml_l2_norm(ctx0, q, eps_norm);
- k = ggml_l2_norm(ctx0, k, eps_norm);
-
- const float scale = 1.0f / sqrtf(S_v);
-
- q = ggml_scale(ctx0, q, scale);
-
- beta = ggml_sigmoid(ctx0, beta);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(beta, "beta_in", il);
- cb(g, "g_in", il);
-
- q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
- g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
-
- beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
- state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
-
- cb(q, "q_perm", il);
- cb(k, "k_perm", il);
- cb(v, "v_perm", il);
- cb(beta, "beta_perm", il);
- cb(g, "g_perm", il);
- cb(state, "state_in", il);
-
- GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
- GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
- GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
-
- // Do padding
- const int64_t chunk_size = CHUNK_SIZE;
-
- const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
- const int64_t n_chunks = (n_tokens + pad) / chunk_size;
-
- q = ggml_pad(ctx0, q, 0, pad, 0, 0);
- k = ggml_pad(ctx0, k, 0, pad, 0, 0);
- v = ggml_pad(ctx0, v, 0, pad, 0, 0);
- g = ggml_pad(ctx0, g, pad, 0, 0, 0);
- beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
-
- cb(q, "q_pad", il);
- cb(k, "k_pad", il);
- cb(v, "v_pad", il);
- cb(beta, "beta_pad", il);
- cb(g, "g_pad", il);
-
- ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
- ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
-
- cb(v_beta, "v_beta", il);
- cb(k_beta, "k_beta", il);
-
- q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
- k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
- k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
- v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
- v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
-
- g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
- beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
-
- ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
- cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
- ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
-
- ggml_tensor * gcs_j_broadcast =
- ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
-
- ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
- cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
- decay_mask = ggml_exp(ctx0, decay_mask);
- decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
-
- ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
-
- ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
- ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
- cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
- ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
-
- ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
- attn = ggml_mul(ctx0, lin_solve, causal_mask);
- attn = ggml_add(ctx0, attn, identity);
- cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
-
- ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
- ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
-
- ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
- cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * k_cumdecay =
- ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
- cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
- attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
- attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
- cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
-
-
- // vectorized calculation of key_gdiff
- // improved from the chunked version:
- // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
- // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
- // key_gdiff = key * g_diff.unsqueeze(-1)
- // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
- // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
-
- // get last element in g_cumsum along chunk_size dimension (ne0)
- // example: [[x, y, z, ..., last], ...] -> [[last], ...]
- ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
- g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
- (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
- g_last = ggml_cont(ctx0, g_last);
- cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
- cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
- cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
-
- ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
- ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
- 1, chunk_size, n_chunks, g_diff_exp->ne[3]);
-
- ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
- cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
-
- ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
- cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
-
-
- // state to be updated per chunk
- ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
- cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
-
- // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
- ggml_tensor * core_attn_out = nullptr;
-
- for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
- // shape: (S_k, chunk_size, 1, H_k * n_seqs)
- ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
-
- // shape: (S_v, chunk_size, 1, H_v * n_seqs)
- ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
-
- // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
- ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
-
- // shape: (chunk_size, 1, H_v * n_seqs)
- ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
-
- // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
- // replaced by precomputed attn_kq
- ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
- cb(attn_chunk, "attn_chunk", il);
-
- ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
-
- // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
- ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
- cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
-
- // v_new = v_i - v_prime
- ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
- ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
- cb(v_new, "v_new_chunk", il);
-
- // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
- ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
- ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
- cb(attn_inter, "attn_inter_chunk", il);
-
- // core_attn_out[:, :, i] = attn_inter + attn @ v_new
- ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
- cb(v_attn, "v_attn_chunk", il);
-
- ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
- cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
-
- core_attn_out = core_attn_out == nullptr
- ? core_attn_out_chunk
- : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
-
- // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
- ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
- //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
- ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
-
- // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
- ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
- new_state = ggml_add(ctx0,
- ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
- ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
- }
-
- // truncate padded tokens
- ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
- S_v, n_tokens, H_v, n_seqs,
- ggml_row_size(core_attn_out->type, S_v),
- ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
- ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
- output_tokens = ggml_cont(ctx0, output_tokens);
- cb(output_tokens, "output_tokens", il);
-
- // permute back to (S_v, H_v, n_tokens, n_seqs)
- output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
- output_tokens = ggml_cont(ctx0, output_tokens);
-
- return {output_tokens, new_state};
-}
-
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_delta_net_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * beta,
- ggml_tensor * state,
- int il) {
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
- GGML_ASSERT(v->ne[2] == n_tokens);
- GGML_ASSERT(k->ne[2] == n_tokens);
- GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
- GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
- GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
-
- GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
-
- const float eps_norm = hparams.f_norm_rms_eps;
-
- q = ggml_l2_norm(ctx0, q, eps_norm);
- k = ggml_l2_norm(ctx0, k, eps_norm);
-
- const float scale = 1.0f / sqrtf(S_v);
-
- q = ggml_scale(ctx0, q, scale);
- beta = ggml_sigmoid(ctx0, beta);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(beta, "beta_in", il);
- cb(g, "g_in", il);
-
- state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
-
- ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
- ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
-
- // Apply exponential to g_t
- g_t = ggml_exp(ctx0, g_t);
-
- // Apply the gated delta rule for the single timestep
- // last_recurrent_state = last_recurrent_state * g_t
- state = ggml_mul(ctx0, state, g_t);
-
- // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
- ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
- ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
- // we need to sum over dim=-2, so we transpose, sum, then transpose again
- kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
-
- // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
- ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
- // delta = (v_t - kv_mem) * beta_t
- ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
- ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
-
- // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
- ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
- state = ggml_add(ctx0, state, k_t_delta);
-
- // Compute the attention output
- // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
- ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
- ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
- // again, since it's over dim = -2, transpose, sum, transpose back
- ggml_tensor * core_attn_out =
- ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
-
- // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
- cb(core_attn_out, "output_tokens", il);
- cb(state, "new_state", il);
-
- return {core_attn_out, state};
-}
-
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
ggml_tensor * input,
int il) {
ggml_tensor * llm_build_qwen35moe::build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
- ggml_tensor * causal_mask,
- ggml_tensor * identity,
- ggml_tensor * diag_mask,
int il) {
const auto * mctx_cur = inp->mctx;
ggml_tensor * z = qkvz.second;
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
- beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
+ beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
cb(beta, "beta", il);
+
+ beta = ggml_sigmoid(ctx0, beta);
+
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
cb(alpha, "alpha", il);
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
+
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il);
+ gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
+
// Get convolution states from cache
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
- // bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
-
// Build the convolution states tensor
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il);
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0];
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
- conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
+
+ conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il);
- qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
- cb(qkv_mixed, "qkv_mixed_permuted", il);
+ qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
+ cb(qkv_mixed, "qkv_mixed_transposed", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il);
cb(state_update_target, "state_update_target", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
- cb(conv_states_all, "conv_states_updated", il);
- // Apply SSM convolution
+ ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
+ cb(state, "state_predelta", il);
+
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il);
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output
- ggml_tensor * q_conv =
- ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
+ ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
+ ggml_row_size(conv_qkv_mix->type, head_k_dim),
+ nb1_qkv,
+ nb1_qkv * n_seq_tokens,
+ 0);
+
+ ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
+ ggml_row_size(conv_qkv_mix->type, head_k_dim),
+ nb1_qkv,
+ nb1_qkv * n_seq_tokens,
+ head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+
+ ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
+ ggml_row_size(conv_qkv_mix->type, head_v_dim),
+ nb1_qkv,
+ nb1_qkv * n_seq_tokens,
+ ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));
+
cb(q_conv, "q_conv", il);
- ggml_tensor * k_conv =
- ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
- head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il);
- ggml_tensor * v_conv =
- ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
- 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il);
- // Unsqueeze them
- q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
- k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
- v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+ const float eps_norm = hparams.f_norm_rms_eps;
- ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
- state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
- cb(state, "state_predelta", il);
+ q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
+ k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);
- // if head keys and value keys are different, repeat Q/K to match V's head count
- // V heads are in tiled order (from conversion), so simple tiled repeat works
+ //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+
+ // if the K and V head counts differ, repeat Q/K so they match V's head count
if (num_k_heads != num_v_heads) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
+ // TODO: try to avoid these explicit repeats by utilizing op broadcast
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
+ attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
}
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
// Update the recurrent states
ggml_build_forward_expand(gf,
- ggml_cpy(ctx0, new_state,
- ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
- kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
-
- // Reshape both attn_out_final and z to 2D tensors for normalization
- // attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
- ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+ ggml_cpy(ctx0, new_state,
+ ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+ kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
- // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
- ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+ // z: keep 4-D as [head_dim, n_heads, n_tokens, n_seqs] so it broadcasts against the attention output
+ ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z)
- ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
+ ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * head_dim, n_tokens, n_seqs]
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
- cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+
return cur;
}
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
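The extra ffn_gate_up_exps argument lets build_moe_ffn use a single fused gate+up expert weight (the LLM_TENSOR_FFN_GATE_UP_EXPS entries added earlier in this patch) instead of two separate matmuls per expert. Conceptually, and assuming the gate half precedes the up half along the output dimension, the fused SwiGLU FFN is:

#include <cmath>
#include <vector>

// SwiGLU FFN with a fused gate+up projection (sketch).
// W_gu is (2*n_ff) x n_embd with gate rows first (assumed layout); W_down is n_embd x n_ff.
std::vector<float> ffn_fused(const std::vector<float> & x,
                             const std::vector<std::vector<float>> & W_gu,
                             const std::vector<std::vector<float>> & W_down,
                             int n_embd, int n_ff) {
    std::vector<float> h(n_ff);
    for (int i = 0; i < n_ff; ++i) {
        float gate = 0.0f, up = 0.0f;
        for (int j = 0; j < n_embd; ++j) {
            gate += W_gu[i][j]        * x[j]; // gate half
            up   += W_gu[n_ff + i][j] * x[j]; // up half
        }
        const float silu = gate / (1.0f + std::exp(-gate));
        h[i] = silu * up;                     // SiLU(gate) * up
    }
    std::vector<float> y(n_embd, 0.0f);
    for (int j = 0; j < n_embd; ++j)
        for (int i = 0; i < n_ff; ++i) y[j] += W_down[j][i] * h[i];
    return y;
}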
// Add shared experts if present - following Qwen3Next reference implementation
-#include "ggml.h"
#include "models.h"
-#define CHUNK_SIZE 64
+#include "llama-memory-recurrent.h"
llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) :
- llm_graph_context_mamba(params), model(model) {
+ llm_build_delta_net_base(params), model(model) {
ggml_tensor * cur;
ggml_tensor * inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
+ ggml_build_forward_expand(gf, cur);
+
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
}
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chunking(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * b,
- ggml_tensor * s,
- int il) {
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(S_k == S_v);
- GGML_ASSERT(H_v % H_k == 0);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
- GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
-
- GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
- GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
- GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
-
- const float scale = 1.0f / sqrtf(S_k);
-
- q = ggml_scale(ctx0, q, scale);
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(b, "b_in", il);
- cb(g, "g_in", il);
-
- q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
- k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
- v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
- g = ggml_permute(ctx0, g, 2, 1, 3, 0); // [ 1, n_tokens, H_v, n_seqs]
- b = ggml_permute(ctx0, b, 2, 0, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
-
- const int CS = CHUNK_SIZE;
-
- const int pad = (CS - n_tokens % CS) % CS;
- const int n_chunks = (n_tokens + pad) / CS;
-
- q = ggml_pad(ctx0, q, 0, pad, 0, 0);
- k = ggml_pad(ctx0, k, 0, pad, 0, 0);
- v = ggml_pad(ctx0, v, 0, pad, 0, 0);
- g = ggml_pad(ctx0, g, 0, pad, 0, 0);
- b = ggml_pad(ctx0, b, 0, pad, 0, 0);
-
- ggml_tensor * v_b = ggml_mul(ctx0, v, b);
- ggml_tensor * k_b = ggml_mul(ctx0, k, b);
-
- cb(v_b, "v_b", il);
- cb(k_b, "k_b", il);
-
- q = ggml_reshape_4d(ctx0, q, S_k, CS, n_chunks, H_k * n_seqs);
- k = ggml_reshape_4d(ctx0, k, S_k, CS, n_chunks, H_k * n_seqs);
- k_b = ggml_reshape_4d(ctx0, k_b, S_k, CS, n_chunks, H_v * n_seqs);
- v = ggml_reshape_4d(ctx0, v, S_v, CS, n_chunks, H_v * n_seqs);
- v_b = ggml_reshape_4d(ctx0, v_b, S_v, CS, n_chunks, H_v * n_seqs);
-
- g = ggml_reshape_4d(ctx0, g, CS, 1, n_chunks, H_v * n_seqs);
- b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);
-
- // [CS, 1, n_chunks, H_v * n_seqs]
- ggml_tensor * g_cs = ggml_cumsum(ctx0, g);
- cb(g_cs, "g_cs", il);
-
- ggml_tensor * g_cs_i = g_cs;
- ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);
-
- g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);
-
- // [CS, CS, n_chunks, H_v * n_seqs]
- ggml_tensor * decay_mask;
- decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
- decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
- decay_mask = ggml_exp(ctx0, decay_mask);
- cb(decay_mask, "decay_mask", il);
-
- // [CS, CS, n_chunks, H_k * n_seqs]
- ggml_tensor * kb;
- kb = ggml_mul_mat(ctx0, k, k_b);
- kb = ggml_mul (ctx0, kb, decay_mask);
-
- // [CS, CS, n_chunks, H_k * n_seqs]
- ggml_tensor * attn;
- attn = ggml_tri(ctx0, kb, GGML_TRI_TYPE_LOWER);
-
- ggml_tensor * identity;
- identity = ggml_view_1d(ctx0, attn, CS, 0);
- identity = ggml_fill (ctx0, identity, 1.0f);
- identity = ggml_diag (ctx0, identity);
-
- ggml_tensor * lhs = ggml_add(ctx0, attn, identity);
- cb(lhs, "dnet_add_ch_lhs", il);
-
- attn = ggml_neg(ctx0, attn);
-
- ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
- attn = ggml_add(ctx0, lin_solve, identity);
- cb(attn, "dnet_add_ch_attn_solved", il); // [CS, CS, n_chunks, H_k * n_seqs]
-
- // [S_v, CS, n_chunks, H_v * n_seqs]
- v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_b)), attn);
-
- // [CS, 1, n_chunks, H_v * n_seqs]
- ggml_tensor * g_exp = ggml_exp(ctx0, g_cs);
-
- k_b = ggml_cont(ctx0, ggml_transpose(ctx0, k_b));
-
- // [CS, S_k, n_chunks, H_k * n_seqs]
- ggml_tensor * kbg = ggml_mul(ctx0, k_b, g_exp);
- cb(kbg, "k_beta_g_exp", il);
-
- // [S_k, CS, n_chunks, H_k * n_seqs]
- ggml_tensor * k_cd = ggml_mul_mat(ctx0, kbg, attn);
- cb(k_cd, "k_cumdecay", il);
-
- // [S_k, CS, n_chunks, H_k * n_seqs]
- ggml_tensor * g_exp_t = ggml_transpose(ctx0, g_exp);
- ggml_tensor * q_g_exp = ggml_mul(ctx0, q, g_exp_t);
-
- // [CS, CS, n_chunks, H_k * n_seqs]
- ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- kq = ggml_mul(ctx0, kq, decay_mask);
- kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
- cb(kq, "kq", il);
-
- // vectorized calculation of key_gdiff
- // improved from the chunked version:
- // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
- // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
- // key_gdiff = key * g_diff.unsqueeze(-1)
- // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
- // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
-
- // get last element in g_cumsum along CS dimension (ne0)
- // example: [[x, y, z, ..., last], ...] -> [[last], ...]
- // [1, 1, n_chunks, H_v * n_seqs]
- ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, 1, g_cs->ne[2], g_cs->ne[3],
- g_cs->nb[1],
- g_cs->nb[2],
- g_cs->nb[3],
- ggml_row_size(g_cs->type, g_cs->ne[0] - 1));
- cb(g_last, "g_last", il);
-
- // TODO: remove this cont when CUDA supports non-cont unary ops
- g_last = ggml_cont(ctx0, g_last);
-
- // [1, 1, n_chunks, H_v * n_seqs]
- ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
- cb(g_last_exp, "g_last_exp", il);
-
- // [CS, 1, n_chunks, H_v * n_seqs]
- ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cs, g_last));
- cb(g_diff, "g_diff", il);
-
- ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
- ggml_tensor * g_diff_exp_t = ggml_transpose(ctx0, g_diff_exp);
-
- // [S_k, CS, n_chunks, H_v * n_seqs]
- ggml_tensor * kg = ggml_mul(ctx0, k, g_diff_exp_t);
- cb(kg, "key_gdiff", il);
-
- // [CS, S_k, n_chunks, H_v * n_seqs]
- ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
- cb(kg_t, "key_gdiff_t", il);
-
- ggml_tensor * s_t = ggml_transpose(ctx0, s);
- s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
- cb(s_t, "dnet_add_ch_state", il);
-
- // [CS, S_v, n_chunks, H_v * n_seqs]
- ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
-
- for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
- ggml_tensor * ch_k_cd = get_slice_2d(ctx0, k_cd, chunk); // [S_k, CS, 1, H_k * n_seqs]
- ggml_tensor * ch_v_t = get_slice_2d(ctx0, v_t, chunk); // [ CS, S_v, 1, H_v * n_seqs]
- ggml_tensor * ch_kq = get_slice_2d(ctx0, kq, chunk); // [ CS, CS, 1, H_k * n_seqs]
- ggml_tensor * ch_q_g_exp = get_slice_2d(ctx0, q_g_exp, chunk); // [S_k, CS, 1, H_k * n_seqs]
- ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
-
- // [CS, S_v, 1, H_v * n_seqs]
- ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
- cb(v_t_p, "v_prime", il);
-
- // [CS, S_v, 1, H_v * n_seqs]
- ggml_tensor * v_t_new = ggml_sub(ctx0, ch_v_t, v_t_p);
- cb(v_t_new, "v_t_new", il);
-
- // [S_v, CS, 1, H_v * n_seqs]
- ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_t_new, ch_kq);
- cb(v_attn, "v_attn", il);
-
- // [S_v, CS, 1, H_v * n_seqs]
- ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
- cb(attn_inter, "attn_inter", il);
-
- // [S_v, CS, 1, H_v * n_seqs]
- ggml_tensor * o_ch = ggml_add(ctx0, attn_inter, v_attn);
- cb(o_ch, "dnet_add_ch_attn_out", il);
-
- v = ggml_set_inplace(ctx0, v, o_ch, v->nb[1], v->nb[2], v->nb[3], chunk * v->nb[2]);
-
- // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
- // TODO: head broadcast might not work here - probably will need a transpose
- ggml_tensor * kgv = ggml_mul_mat(ctx0, ch_kg_t, v_t_new); // [S_k, S_v, 1, H_k * n_seqs]
-
- // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
- ggml_tensor * ch_g_last_exp = get_slice_2d(ctx0, g_last_exp, chunk);
- s_t = ggml_mul(ctx0, s_t, ch_g_last_exp);
- s_t = ggml_add(ctx0, s_t, kgv);
- cb(s_t, "dnet_add_ch_state", il);
- }
-
- s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
-
- // truncate padded tokens
- ggml_tensor * o = ggml_view_4d(ctx0, v,
- S_v, n_tokens, H_v, n_seqs,
- ggml_row_size(v->type, S_v),
- ggml_row_size(v->type, S_v * CS * n_chunks),
- ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
-
- o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
- s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
-
- return {o, s};
-}
-
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive(
- ggml_tensor * q,
- ggml_tensor * k,
- ggml_tensor * v,
- ggml_tensor * g,
- ggml_tensor * b, // beta
- ggml_tensor * s, // state
- int il) {
- const int64_t S_k = q->ne[0];
- const int64_t H_k = q->ne[1];
- const int64_t n_tokens = q->ne[2];
- const int64_t n_seqs = q->ne[3];
-
- const int64_t S_v = v->ne[0];
- const int64_t H_v = v->ne[1];
-
- GGML_ASSERT(n_tokens == 1);
-
- GGML_ASSERT(S_k == S_v);
- GGML_ASSERT(H_v % H_k == 0);
-
- GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
- GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
- GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
-
- GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
- GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
- GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
-
- const float scale = 1.0f / sqrtf(S_k);
-
- q = ggml_scale(ctx0, q, scale);
-
- q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
- k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
- v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
-
- cb(q, "q_in", il);
- cb(k, "k_in", il);
- cb(v, "v_in", il);
- cb(b, "b_in", il);
- cb(g, "g_in", il);
-
- g = ggml_reshape_4d(ctx0, g, 1, 1, H_v, n_seqs);
- b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);
-
- // [S_v, S_v, H_v, n_seqs]
- g = ggml_exp(ctx0, g);
- s = ggml_mul(ctx0, s, g);
-
- ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
-
- // [1, S_v, H_v, n_seqs]
- ggml_tensor * sk;
- sk = ggml_mul (ctx0, s_t, k);
- sk = ggml_sum_rows(ctx0, sk);
-
- // [S_v, 1, H_v, n_seqs]
- ggml_tensor * d;
- d = ggml_sub(ctx0, v, ggml_transpose(ctx0, sk));
- d = ggml_mul(ctx0, d, b);
-
- // [1, S_v, H_v, n_seqs]
- ggml_tensor * d_t;
- d_t = ggml_transpose(ctx0, d);
-
- // [S_v, S_v, H_v, n_seqs]
- ggml_tensor * kd;
- k = ggml_repeat(ctx0, k, s);
- kd = ggml_mul (ctx0, k, d_t);
-
- s_t = ggml_add(ctx0, s_t, kd);
-
- cb(s_t, "dnet_add_ar_state", il);
-
- ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
- ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
-
- o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
- s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
-
- return {o, s};
-}
-
ggml_tensor * llm_build_qwen3next::build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,
ggml_tensor * beta = ggml_sigmoid(ctx0, b);
- beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
-
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il);
+ beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
+ gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
+
// Get convolution states from cache
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
cb(state_update_target, "state_update_target", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
- cb(conv_states_all, "conv_states_updated", il);
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
// Add shared experts if present - following Qwen3Next reference implementation
#include "models.h"
+#include "llama-memory-recurrent.h"
+
llm_build_rwkv6_base::llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model) {}
#include "models.h"
+#include "llama-memory-recurrent.h"
+
llm_build_rwkv7_base::llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model) {}
} else if (regex_expr == "\\p{AFMoE_digits}") {
// AFMOE digit pattern - use custom implementation for proper splitting
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
+ } else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
+ // tiny_aya digit grouping pattern from tokenizer.json:
+ // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
+ // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
+ // TODO: revisit this regex in case there are subtle tokenization differences from the original.
+ bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
}
return bpe_offsets;
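For reference, the splitting behavior the tiny_aya pattern above is meant to produce: each run of digits is split into groups of three, anchored at the right end of the run. A standalone sketch of that behavior (not the actual custom splitter):

#include <cstdio>
#include <string>
#include <vector>

// Split a run of ASCII digits into groups of 3, anchored at the right end,
// e.g. "1234567" -> {"1", "234", "567"} (sketch of the intended behavior).
std::vector<std::string> group_digits(const std::string & run) {
    std::vector<std::string> out;
    const size_t head = run.size() % 3; // leading group of 1 or 2 digits, if any
    if (head) out.push_back(run.substr(0, head));
    for (size_t i = head; i < run.size(); i += 3) out.push_back(run.substr(i, 3));
    return out;
}

int main() {
    for (const auto & g : group_digits("1234567")) std::printf("%s ", g.c_str());
    std::printf("\n"); // prints: 1 234 567
    return 0;
}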