#include <map>
#include <set>
+#include <vector>
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+ { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+ { LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ { LLM_TENSOR_FFN_LATENT_DOWN, "blk.%d.ffn_latent_down" },
+ { LLM_TENSOR_FFN_LATENT_UP, "blk.%d.ffn_latent_up" },
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_CLS_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_FFN_LATENT_DOWN,
+ LLM_TENSOR_FFN_LATENT_UP,
// MoE shared expert layer
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ // Nemotron 3 Super
+ {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
return name;
}
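+// return every architecture registered in LLM_ARCH_NAMES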
+std::vector<llm_arch> llm_arch_all() {
+ std::vector<llm_arch> ret;
+ ret.reserve(LLM_ARCH_NAMES.size());
+ for (const auto & [arch, _] : LLM_ARCH_NAMES) {
+ ret.push_back(arch);
+ }
+ return ret;
+}
+
const char * llm_arch_name(llm_arch arch) {
auto it = LLM_ARCH_NAMES.find(arch);
if (it == LLM_ARCH_NAMES.end()) {
#include <string>
#include <set>
+#include <vector>
//
// gguf constants (sync with gguf.py)
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
+ LLM_KV_MOE_LATENT_SIZE,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_NUM_DEEPSTACK_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+ LLM_KV_ATTENTION_KEY_LENGTH_SWA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_COUNT_SWA,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_FFN_LATENT_DOWN,
+ LLM_TENSOR_FFN_LATENT_UP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
ggml_op op;
};
+std::vector<llm_arch> llm_arch_all();
+
const char * llm_arch_name(llm_arch arch);
llm_arch llm_arch_from_string(const std::string & name);
clear();
split_reset();
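+    // each token can carry more than one position (e.g. M-RoPE), so the pos buffer holds n_pos_per_embd entries per token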
+ const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
+
auto udata = std::make_shared<llama_ubatch::data_t>();
udata->token .resize(n_tokens);
udata->embd .clear();
- udata->pos .resize(n_tokens);
+ udata->pos .resize(n_pos_all);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);
#include "llama-memory.h"
#include "llama-mmap.h"
#include "llama-model.h"
+#include "llama-ext.h"
#include <cinttypes>
#include <cmath>
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
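+    // fused Gated Delta Net: start enabled and resolve actual per-device support during graph reservation below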
+ cparams.fused_gdn_ar = true;
+ cparams.fused_gdn_ch = true;
+ cparams.auto_fgdn = true;
+
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
- // intialized later
+ // initialized later
cparams.pipeline_parallel = false;
{
if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
+
+ if (!graph_reuse_disable) {
+ // TODO: figure out a way to make graph reuse work with pipeline parallelism
+ // ref: https://github.com/ggml-org/llama.cpp/pull/20463
+ LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
+
+ graph_reuse_disable = true;
+ }
}
sched_reserve();
if (cparams.auto_fa) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
- throw std::runtime_error("failed to split graph for Flash Attention check");
+ throw std::runtime_error("failed to reserve graph for Flash Attention check");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
continue;
}
- ggml_backend_dev_t device_fa = ggml_backend_get_device(
- ggml_backend_sched_get_tensor_backend(sched.get(), n));
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
break;
}
}
+
if (fa_device_mismatch) {
cparams.flash_attn = false;
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
cparams.auto_fa = false;
}
+ if (cparams.auto_fgdn) {
+ LLAMA_LOG_INFO("%s: resolving fused Gated Delta Net support:\n", __func__);
+
+ if (cparams.fused_gdn_ar) {
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (autoregressive)");
+ }
+
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_AR) + 1;
+ bool gdn_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_GATED_DELTA_NET) {
+ continue;
+ }
+ ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_AR "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_gdn != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
+ gdn_device_mismatch = true;
+ break;
+ }
+ }
+
+ if (gdn_device_mismatch) {
+ cparams.fused_gdn_ar = false;
+ LLAMA_LOG_WARN("%s: fused Gated Delta Net (autoregressive) not supported, set to disabled\n", __func__);
+ } else {
+ LLAMA_LOG_INFO("%s: fused Gated Delta Net (autoregressive) enabled\n", __func__);
+ }
+ }
+
+ if (cparams.fused_gdn_ch) {
+            // use more than one token in the batch per sequence in order to take the chunked path
+ // note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
+ // because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
+ // it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
+ // the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
+ const uint32_t n_tokens_ch = 16*n_seqs;
+ auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
+ }
+
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_CH) + 1;
+ bool gdn_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_GATED_DELTA_NET) {
+ continue;
+ }
+ ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_CH "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_gdn != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
+ gdn_device_mismatch = true;
+ break;
+ }
+ }
+
+ if (gdn_device_mismatch) {
+ cparams.fused_gdn_ch = false;
+ LLAMA_LOG_WARN("%s: fused Gated Delta Net (chunked) not supported, set to disabled\n", __func__);
+ } else {
+ LLAMA_LOG_INFO("%s: fused Gated Delta Net (chunked) enabled\n", __func__);
+ }
+ }
+
+ cparams.auto_fgdn = false;
+ }
+
// reserve worst-case graph
int n_splits_pp = -1;
int n_nodes_pp = -1;
bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);
- if (n_adapters != loras->size()) {
- return false;
- }
+ // Adapters with a zero scale are never added to `loras`, so also ignore them for the comparison.
+ size_t n_non_zero = 0;
for (size_t i = 0; i < n_adapters; i ++) {
+ if (scales[i] == 0.0f) {
+ continue;
+ }
+ n_non_zero++;
+
auto it = loras->find(adapters[i]);
if (it == loras->end() || it->second != scales[i]) {
}
}
+ if (n_non_zero != loras->size()) {
+ return false;
+ }
+
return true;
}
{
//const auto t_start_us = ggml_time_us();
+ // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
res->set_inputs(&ubatch);
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
ggml_backend_sched_reset(sched.get());
- // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
+ // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
gf_res_prev->reset();
// store the n_outputs as it is, and restore it afterwards
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
- if (model->hparams.n_embd_head_k % blck_size != 0) {
- LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
- __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
- return nullptr;
+ for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+ if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
+ return nullptr;
+ }
}
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
- if (model->hparams.n_embd_head_v % blck_size != 0) {
- LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
- __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
- return nullptr;
+ for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+ if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
+ __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
+ return nullptr;
+ }
}
}
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
+struct ggml_cgraph * llama_graph_reserve(
+ struct llama_context * ctx,
+ uint32_t n_tokens,
+ uint32_t n_seqs,
+ uint32_t n_outputs) {
+ auto * memory = ctx->get_memory();
+ llama_memory_context_ptr mctx;
+ if (memory) {
+ mctx = memory->init_full();
+ }
+ return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
+}
+
// llama adapter API
int32_t llama_set_adapters_lora(
bool offload_kqv;
bool flash_attn;
bool auto_fa;
+ bool fused_gdn_ar; // use fused gated delta net (autoregressive)
+ bool fused_gdn_ch; // use fused gated delta net (chunked)
+ bool auto_fgdn;
bool no_perf;
bool warmup;
bool op_offload;
--- /dev/null
+#pragma once
+
+#include "llama-context.h"
+#include "ggml.h"
+#include "stdint.h"
+
+// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
+LLAMA_API struct ggml_cgraph * llama_graph_reserve(
+ struct llama_context * ctx,
+ uint32_t n_tokens,
+ uint32_t n_seqs,
+ uint32_t n_outputs);
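+//
+// usage sketch (illustrative, not part of the API): reserve a worst-case graph and walk its nodes;
+// `ctx` is assumed to be a llama_context created by the caller
+//
+//     ggml_cgraph * gf = llama_graph_reserve(ctx, /*n_tokens=*/512, /*n_seqs=*/1, /*n_outputs=*/512);
+//     for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+//         ggml_tensor * node = ggml_graph_node(gf, i);
+//         // inspect node->op, node->name, the assigned backend, etc.
+//     }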
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
- uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
+ uint64_t min_times = std::stoull(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
uint64_t max_times = UINT64_MAX; // default: no max limit
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
- max_times = std::stoul(std::string(pos, int_end - pos));
+ max_times = std::stoull(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
// if there is a grammar, parse it
// rules will be empty (default) if there are parse errors
if (!parser.parse(grammar_str) || parser.rules.empty()) {
- fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+ LLAMA_LOG_ERROR("failed to parse grammar\n");
return nullptr;
}
- // Ensure that there is a "root" node.
- if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
- fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+ // Ensure that the grammar contains the start symbol
+ if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
+ LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
return nullptr;
}
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
- LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+ LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
return nullptr;
}
}
const bool last = (
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
- (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+ (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL)) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) {
float * data = (float *) cross_kq_mask->data;
for (int i = 0; i < n_tokens; ++i) {
+ GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
for (int j = 0; j < n_enc; ++j) {
float f = -INFINITY;
ubatch (params.ubatch),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
- n_rot (hparams.n_rot),
+ n_rot (hparams.n_rot()),
n_ctx (cparams.n_ctx),
n_head (hparams.n_head()),
n_head_kv (hparams.n_head_kv()),
- n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_head_k (hparams.n_embd_head_k()),
n_embd_k_gqa (hparams.n_embd_k_gqa()),
- n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_head_v (hparams.n_embd_head_v()),
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w,
- ggml_tensor * cur) const {
+ ggml_tensor * cur,
+ ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
for (const auto & lora : *loras) {
res = ggml_add(ctx0, res, ab_cur);
}
+ if (w_s) {
+ res = ggml_mul(ctx0, res, w_s);
+ }
+
return res;
}
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in,
- ggml_tensor * gate_up_exps) const {
+ ggml_tensor * gate_up_exps,
+ ggml_tensor * up_exps_s,
+ ggml_tensor * gate_exps_s,
+ ggml_tensor * down_exps_s) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
n_expert_used,
type_op,
norm_w,
- scale_w,
w_scale,
gating_op,
il,
probs_in,
- gate_up_exps
+ gate_up_exps,
+ /* gate_up_exps_b */ nullptr,
+ up_exps_s,
+ gate_exps_s,
+ down_exps_s
);
}
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in,
ggml_tensor * gate_up_exps,
- ggml_tensor * gate_up_exps_b) const {
+ ggml_tensor * gate_up_exps_b,
+ ggml_tensor * up_exps_s,
+ ggml_tensor * gate_exps_s,
+ ggml_tensor * down_exps_s) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
}
- if (scale_w) {
+ if (w_scale != 0.0f && w_scale != 1.0f) {
weights = ggml_scale(ctx0, weights, w_scale);
cb(weights, "ffn_moe_weights_scaled", il);
}
cb(gate_up, "ffn_moe_gate_up_biased", il);
}
+ // apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
+ if (up_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ gate_up = ggml_mul(ctx0, gate_up, s);
+ cb(gate_up, "ffn_moe_gate_up_scaled", il);
+ }
+
const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
cb(up, "ffn_moe_up_biased", il);
}
+ // apply per-expert scale2 to up
+ if (up_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ up = ggml_mul(ctx0, up, s);
+ cb(up, "ffn_moe_up_scaled", il);
+ }
+
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
+
+ // apply per-expert scale2 to gate
+ if (gate_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ cur = ggml_mul(ctx0, cur, s);
+ cb(cur, "ffn_moe_gate_scaled", il);
+ }
}
const bool has_gate = gate_exps || gate_up_exps;
cb(experts, "ffn_moe_down_biased", il);
}
+ // apply per-expert scale2 to down
+ if (down_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ experts = ggml_mul(ctx0, experts, s);
+ cb(experts, "ffn_moe_down_scaled", il);
+ }
+
if (!weight_before_ffn) {
experts = ggml_mul(ctx0, experts, weights);
cb(cur, "ffn_moe_weighted", il);
// this needs to be 1x1xN for broadcasting
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur);
+ ggml_set_name(cur, "attn_scale");
res->add_input(std::move(inp));
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
// but this would make the graph topology depend on the number of output tokens, which can interfere with
- // features that require constant topology such as pipline parallelism
+ // features that require constant topology such as pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
//if (n_outputs < n_tokens) {
// return nullptr;
if (v_mla) {
#if 0
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
- // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
+ // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
cur = ggml_mul_mat(ctx0, v_mla, cur);
#else
}
// softmax for qwen3 reranker
- if (arch == LLM_ARCH_QWEN3) {
+ if (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL) {
cur = ggml_soft_max(ctx0, cur);
}
} break;
ggml_tensor * cur,
int il) const;
- // do mat_mul, while optionally apply lora
+    // do mat_mul, while optionally applying lora and per-tensor scale
ggml_tensor * build_lora_mm(
ggml_tensor * w,
- ggml_tensor * cur) const;
+ ggml_tensor * cur,
+ ggml_tensor * w_s = nullptr) const;
// do mat_mul_id, while optionally applying lora
ggml_tensor * build_lora_mm_id(
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr,
- ggml_tensor * gate_up_exps = nullptr) const;
+ ggml_tensor * gate_up_exps = nullptr,
+ ggml_tensor * up_exps_s = nullptr,
+ ggml_tensor * gate_exps_s = nullptr,
+ ggml_tensor * down_exps_s = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr,
ggml_tensor * gate_up_exps = nullptr,
- ggml_tensor * gate_up_exps_b = nullptr) const;
+ ggml_tensor * gate_up_exps_b = nullptr,
+ ggml_tensor * up_exps_s = nullptr,
+ ggml_tensor * gate_exps_s = nullptr,
+ ggml_tensor * down_exps_s = nullptr) const;
//
// inputs
return n_head/n_head_kv;
}
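+// RoPE dimension count for layer il; SWA layers can use a different value than full-attention layers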
+uint32_t llama_hparams::n_rot(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_rot_swa : n_rot_full;
+ }
+
+ GGML_ABORT("fatal error");
+}
+
uint32_t llama_hparams::n_embd_inp() const {
uint32_t n_embd_inp = n_embd;
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
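+// per-layer head sizes; SWA layers can use different K/V head dimensions than full-attention layers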
+uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
+ }
+
+ GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
+ }
+
+ GGML_ABORT("fatal error");
+}
+
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
- return n_embd_head_k * n_head_kv;
+ return n_embd_head_k(il) * n_head_kv;
}
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
- return n_embd_head_v * n_head_kv;
+ return n_embd_head_v(il) * n_head_kv;
}
bool llama_hparams::is_n_embd_k_gqa_variable() const {
}
uint32_t llama_hparams::n_embd_head_k_mla() const {
- return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
+ return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
}
uint32_t llama_hparams::n_embd_head_v_mla() const {
- return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
+ return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
}
bool llama_hparams::has_kv(uint32_t il) const {
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
- uint32_t n_rot;
- uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
- uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ // different head size for full_attention and SWA layers
+ uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
+ uint32_t n_embd_head_k_swa;
+ uint32_t n_embd_head_v_swa;
+
+ // different RoPE dimensions for full_attention and SWA layers
+ uint32_t n_rot_full;
+ uint32_t n_rot_swa;
+
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla_impl = 0;
uint32_t n_embd_head_v_mla_impl = 0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
+ uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
float f_norm_eps;
uint32_t n_gqa(uint32_t il = 0) const;
+ uint32_t n_rot(uint32_t il = 0) const;
+
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
+ // dimension of key/value embeddings for each head (per layer)
+ uint32_t n_embd_head_k(uint32_t il = 0) const;
+ uint32_t n_embd_head_v(uint32_t il = 0) const;
+
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
char buf[256];
- snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+ snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
for (size_t i = 1; i < ne.size(); i++) {
- snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
}
return buf;
}
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
-#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
+#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
+#define LLAMA_TENSOR_NAME_FGDN_AR "__fgdn_ar__"
+#define LLAMA_TENSOR_NAME_FGDN_CH "__fgdn_ch__"
break;
}
- // remeber the position that we found
+ // remember the position that we found
res.push_back(sinfo_new);
// store the old state of the cells in the recovery stack
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
return ggml_view_4d(ctx, k,
- hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
- ggml_row_size(k->type, hparams.n_embd_head_k),
+ hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(k->type, hparams.n_embd_head_k(il)),
ggml_row_size(k->type, n_embd_k_gqa),
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
if (!v_trans) {
// note: v->nb[1] <= v->nb[2]
return ggml_view_4d(ctx, v,
- hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
- ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
// note: v->nb[1] > v->nb[2]
return ggml_view_4d(ctx, v,
- n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
- ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
+ ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, kv_size), // v->nb[2]
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
}
for (uint32_t s = 0; s < n_stream; ++s) {
- // bookeeping of the KQ mask cells that could change for other tokens of the same sequence
+ // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale) const {
+ float freq_scale,
+ uint32_t il) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
- const auto & n_rot = hparams.n_rot;
+ const auto & n_rot = hparams.n_rot(il);
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
- const auto & n_embd_head_k = hparams.n_embd_head_k;
- //const auto & n_embd_head_v = hparams.n_embd_head_v;
-
- const auto & n_rot = hparams.n_rot;
-
- const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
-
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const auto n_rot = hparams.n_rot(il);
+ const auto n_embd_head_k = hparams.n_embd_head_k(il);
+ const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
- ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
+ ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
io.write(&pos, sizeof(pos));
io.write(&n_seq_id, sizeof(n_seq_id));
- // TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
- // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
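+        // when each embedding has more than one position (M-RoPE), also persist the per-cell extended position data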
+ if (hparams.n_pos_per_embd() > 1) {
+ const llama_kv_cell_ext ext = cells.ext_get(i);
+ io.write(&ext, sizeof(ext));
+ }
for (const auto & seq_id : seq_ids) {
io.write(&seq_id, sizeof(seq_id));
return false;
}
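+        // the extra M-RoPE positions are restored into the second and third n_tokens-sized blocks of ubatch.pos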
+ if (hparams.n_pos_per_embd() > 1) {
+ llama_kv_cell_ext ext;
+ io.read_to(&ext, sizeof(ext));
+
+ ubatch.pos[i + ubatch.n_tokens] = ext.y;
+ ubatch.pos[i + ubatch.n_tokens*2] = ext.x;
+ }
+
// read the sequence id, but directly discard it - we will use dest_seq_id instead
{
llama_seq_id seq_id;
cells.pos_set(i, pos);
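+        // restore the per-cell extended position data (M-RoPE)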
+ if (hparams.n_pos_per_embd() > 1) {
+ llama_kv_cell_ext ext;
+ io.read_to(&ext, sizeof(ext));
+ cells.ext_set(i, ext);
+ }
+
for (uint32_t j = 0; j < n_seq_id; ++j) {
llama_seq_id seq_id;
io.read_to(&seq_id, sizeof(seq_id));
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale) const;
+ float freq_scale,
+ uint32_t il) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
#include "llama-model-loader.h"
+#include "ggml-alloc.h"
#include "ggml.h"
+#include "gguf.h"
+#include "llama-hparams.h"
#include <algorithm>
#include <array>
#include <cinttypes>
+#include <cstdint>
#include <cstring>
#include <future>
+#include <regex>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
+ case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
- const int kid = gguf_find_key(meta.get(), key.c_str());
+ const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
}
struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
result = arr_info.length;
template<typename T>
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
- const gguf_context * ctx = meta.get();
+ const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
- const gguf_context * ctx = meta.get();
+ const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
- const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
+ const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
- const int kid = gguf_find_key(meta.get(), key.c_str());
+ const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
}
- if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
+ if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
if (n != arr_info.length) {
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
- const int id = gguf_find_key(meta.get(), key.c_str());
+ const int id = gguf_find_key(metadata, key.c_str());
if (id < 0) {
if (required) {
}
// throw an error if the type is an array
- if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
llama_model_loader::llama_model_loader(
+ struct gguf_context * meta,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
- const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
+ : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
tensor_buft_overrides = param_tensor_buft_overrides_p;
- // Load the main GGUF
- struct ggml_context * ctx = NULL;
- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx,
- };
-
- meta.reset(gguf_init_from_file(fname.c_str(), params));
- if (!meta) {
- throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
- }
-
- get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
- llm_kv = LLM_KV(llm_arch_from_string(arch_name));
-
- files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
- contexts.emplace_back(ctx);
+ if (!fname.empty()) {
+ // Load the main GGUF
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
- if (use_mmap && use_direct_io) {
- if (files.back()->has_direct_io()) {
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
- use_mmap = false;
- } else {
- LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
- use_direct_io = false;
-
- // reopen file using std::fopen for mmap
- files.pop_back();
- files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
+ metadata = metadata_ptr.get();
+ if (metadata == nullptr) {
+ throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
}
- }
- // Save tensors data offset of the main file.
- // For subsidiary files, `meta` tensor data offset must not be used,
- // so we build a unified tensors index for weights.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- std::string tensor_name = std::string(cur->name);
- // make sure there is no duplicated tensor names
- if (weights_map.find(tensor_name) != weights_map.end()) {
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
- }
- n_elements += ggml_nelements(cur);
- n_bytes += ggml_nbytes(cur);
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
- }
- uint16_t n_split = 0;
- get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
- // Load additional GGML contexts
- if (n_split > 1) {
- // make sure the main file is loaded first
- uint16_t idx = 0;
- const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
- get_key(kv_split_no, idx);
- if (idx != 0) {
- throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
- }
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+ contexts.emplace_back(ctx);
- // generate list of splits if needed
- if (splits.empty()) {
- splits = llama_get_list_splits(fname, idx, n_split);
- }
+ if (use_mmap && use_direct_io) {
+ if (files.back()->has_direct_io()) {
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ use_mmap = false;
+ } else {
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
+ use_direct_io = false;
- // in case user give a custom list of splits, check if it matches the expected number
- if (n_split != (uint16_t)splits.size()) {
- throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+ // reopen file using std::fopen for mmap
+ files.pop_back();
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ }
}
- if (trace > 0) {
- LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
- }
+ // Save tensors data offset of the main file.
+        // For subsidiary files, the `metadata` tensor data offset must not be used,
+ // so we build a unified tensors index for weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+ // make sure there is no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
+ }
+ uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ // make sure the main file is loaded first
+ uint16_t idx = 0;
+ const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+ get_key(kv_split_no, idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+ }
- // load other splits
- for (idx = 1; idx < n_split; idx++) {
- const char * fname_split = splits[idx].c_str();
+ // generate list of splits if needed
+ if (splits.empty()) {
+ splits = llama_get_list_splits(fname, idx, n_split);
+ }
- struct gguf_init_params split_params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx,
- };
- gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
- if (!ctx_gguf) {
- throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
+            // in case the user gives a custom list of splits, check if it matches the expected number
+ if (n_split != (uint16_t)splits.size()) {
+ throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
}
- // check idx
- {
- const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
- if (kid < 0) {
- throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ // load other splits
+ for (idx = 1; idx < n_split; idx++) {
+ const char * fname_split = splits[idx].c_str();
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
}
- int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
- if (idx_gguf != idx) {
- throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+
+ // check idx
+ {
+ const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+ if (kid < 0) {
+ throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ }
+ int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+ if (idx_gguf != idx) {
+ throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+ }
}
- }
- files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
- contexts.emplace_back(ctx);
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
+ contexts.emplace_back(ctx);
- // Save tensors data offset info of the shard.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- std::string tensor_name = std::string(cur->name);
- // make sure there is no duplicated tensor names
- if (weights_map.find(tensor_name) != weights_map.end()) {
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ // Save tensors data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+ // make sure there is no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
- n_elements += ggml_nelements(cur);
- n_bytes += ggml_nbytes(cur);
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
- }
- get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
- // sanity check
- {
- const int n_tensors_loaded = (int) weights_map.size();
- if (n_tensors != n_tensors_loaded) {
- throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights_map.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
}
- }
- LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+ } else {
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
}
- n_kv = gguf_get_n_kv(meta.get());
+ n_kv = gguf_get_n_kv(metadata);
n_tensors = weights_map.size();
- fver = (enum llama_fver) gguf_get_version(meta.get());
+ fver = (enum llama_fver) gguf_get_version(metadata);
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
+ case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(meta.get(), i);
- const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
+ const char * name = gguf_get_key(metadata, i);
+ const enum gguf_type type = gguf_get_kv_type(metadata, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
: gguf_type_name(type);
- std::string value = gguf_kv_to_str(meta.get(), i);
+ std::string value = gguf_kv_to_str(metadata, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
return cur;
}
-struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
- LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+ GGML_ASSERT(w != nullptr);
+
+ if (op == GGML_OP_NONE) {
+ return true;
+ }
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
+ if (!ctx_ptr) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+ ggml_context * ctx = ctx_ptr.get();
+
+ ggml_tensor * op_tensor = nullptr;
+
+ switch (op) {
+ case GGML_OP_GET_ROWS:
+ {
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_get_rows(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul_mat(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ const int n_expert_used = hparams.n_expert_used;
+ GGML_ASSERT(n_expert_used > 0);
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+ } break;
+ case GGML_OP_ADD:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_add(ctx, a, w);
+ } break;
+ case GGML_OP_ADD_ID:
+ {
+ const int n_expert_used = hparams.n_expert_used;
+ GGML_ASSERT(n_expert_used > 0);
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_add_id(ctx, a, w, c);
+ } break;
+ case GGML_OP_MUL:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul(ctx, a, w);
+ } break;
+ case GGML_OP_DIV:
+ {
+ ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+ op_tensor = ggml_div(ctx, a, w);
+ } break;
+ case GGML_OP_ROPE:
+ {
+ const int n_embd_head = hparams.n_embd_head_v();
+ const int n_head = hparams.n_head();
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_rope_ext(
+ ctx, a, b, w,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+
+ } break;
+ case GGML_OP_SSM_CONV:
+ {
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
+ op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+ const int64_t n_head = w->ne[1];
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
+ } break;
+ case GGML_OP_RWKV_WKV6:
+ {
+ // FIXME
+ const int64_t S = 123;
+ const int64_t H = 123;
+ const int64_t n_tokens = 123;
+ const int64_t n_seqs = 123;
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * tf = w;
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+ op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+ } break;
+ case GGML_OP_IM2COL:
+ {
+ const int n_embd_inp = hparams.n_embd_inp();
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+ } break;
+ case GGML_OP_SCALE:
+ {
+ op_tensor = ggml_scale(ctx, w, 1.0f);
+ } break;
+ default:
+ GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+ }
+
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+ GGML_ASSERT(w->buffer == nullptr);
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+ ggml_backend_buffer_free(w->buffer);
+ w->buffer = nullptr;
+
+ return op_supported;
+}
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
+ GGML_ASSERT(!buft_list->empty());
+ for (const auto & cur : *buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+ return cur_buft;
+ }
+ }
+
+ return nullptr;
+}
+
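+// create a tensor described by tn/ne, choosing the ggml context and buffer type based on the layer the tensor
+// belongs to and the provided buffer type lists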
+struct ggml_tensor * llama_model_loader::create_tensor(
+ const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
+ const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ // one ggml context per buffer type
+ int max_n_tensors = n_tensors;
+ max_n_tensors += 1; // duplicated output tensor
+ max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
+ if (files.empty()) {
+ max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
+ }
+ const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+
+ ctx_map.emplace(buft, ctx);
+
+ return ctx;
+ }
+ return it->second.get();
+ };
+
+ auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
+ if (!t_meta) {
+ if (flags & TENSOR_NOT_REQUIRED) {
+ return nullptr;
+ }
+ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
+ }
+
+ // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
+ // the tensor is duplicated
+ // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
+ llm_tensor tn_tensor = tn.tensor;
+ if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
+ tn_tensor = LLM_TENSOR_OUTPUT;
+ }
+
+ llm_tensor_info info;
+ try {
+ info = llm_tensor_info_for(tn_tensor);
+ } catch (const std::out_of_range & e) {
+ throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+ }
+
+ // skip unused tensors
+ if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
+ const size_t nbytes = ggml_nbytes(t_meta);
+ LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
+
+ size_data -= nbytes;
+ n_created++;
+
+ return nullptr;
+ }
+
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
+ ggml_op op;
+ bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
+ if (bias) {
+ if (info.op == GGML_OP_MUL_MAT_ID) {
+ op = GGML_OP_ADD_ID;
+ } else {
+ op = GGML_OP_ADD;
+ }
+ } else {
+ op = info.op;
+ }
+
+ // sanity checks
+ if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+ if (tn.bid != -1) {
+ GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+ }
+ } else {
+ if (tn.bid == -1) {
+ GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
+ }
+ }
+
+ // select the buffer type for this tensor
+ const buft_list_t * buft_list;
+ switch (info.layer) {
+ case LLM_TENSOR_LAYER_INPUT:
+ buft_list = buft_list_input;
+ break;
+ case LLM_TENSOR_LAYER_OUTPUT:
+ buft_list = buft_list_output;
+ break;
+ case LLM_TENSOR_LAYER_REPEATING:
+ GGML_ASSERT(buft_list_layer != nullptr);
+ buft_list = buft_list_layer;
+ break;
+ default:
+ GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
+ }
+
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
+ if (tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+ // when overriding to a CPU buffer, consider the extra buffer types
+ buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
+ } else {
+ buft = overrides->buft;
+ }
+
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+ ggml_backend_buft_name(buft));
+ break;
+ }
+ }
+ }
+
+ if (!buft) {
+ buft = select_weight_buft(hparams, t_meta, op, buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
+ }
+
+ // avoid using a host buffer when using mmap
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
+ if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
+ }
+
+ if (buft != buft_list->front().second) {
+ if (n_tensors_moved == 0) {
+ first_tensor_moved_name = t_meta->name;
+ first_tensor_moved_type_name = ggml_type_name(t_meta->type);
+ first_moved_from_buft = buft_list->front().second;
+ first_moved_to_buft = buft;
+ }
+ n_tensors_moved++;
+ }
+
+ return buft;
+ };
+
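+ // no backing files: construct the tensor directly from the GGUF metadata (or the requested shape), with no data to load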
+ if (files.empty()) {
+ if (flags & TENSOR_SKIP_IF_VIRTUAL) {
+ return nullptr;
+ }
+ ggml_type type = GGML_TYPE_F32;
+ const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
+ if (tid != -1) {
+ type = gguf_get_tensor_type(metadata, tid);
+ }
+
+ // for tensors that are not required, some of the dimensions can be invalid:
+ if (flags & TENSOR_NOT_REQUIRED) {
+ for (size_t dim = 0; dim < ne.size(); dim++) {
+ if (ne.begin()[dim] <= 0) {
+ return nullptr;
+ }
+ }
+ }
+
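+ // build a temporary metadata-only tensor describing the requested shape and type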
+ ggml_tensor t_meta;
+ memset(&t_meta, 0, sizeof(ggml_tensor));
+ t_meta.type = type;
+ for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
+ t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
+ GGML_ASSERT(t_meta.ne[dim] >= 1);
+ t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
+ GGML_ASSERT(t_meta.nb[dim] >= 1);
+ }
+ ggml_set_name(&t_meta, tn.str().c_str());
+
+ ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
+ GGML_ASSERT(buft != nullptr);
+ ggml_context * ctx = ctx_for_buft(buft);
+ ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
+ ggml_set_name(ret, tn.str().c_str());
+ return ret;
+ }
+
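+ // regular path: the tensor metadata comes from the model file(s)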
+ ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
+ ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
+ if (buft == nullptr) {
+ return nullptr; // a null buft means the tensor is optional or skipped; return a null ggml_tensor *
+ }
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+ if (flags & TENSOR_DUPLICATED) {
+ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+ if (t) {
+ return t;
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
+ const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
if (cur == NULL) {
return NULL;
}
- bool duplicated = flags & TENSOR_DUPLICATED;
+ const bool duplicated = flags & TENSOR_DUPLICATED;
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
}
return tensor;
-
}
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
if (n_created != n_tensors) {
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
+ if (n_tensors_moved > 0) {
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
+ __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ }
}
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
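+ // without files there is no data to read from disk; let the set_tensor_data callback populate each tensor instead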
+ if (files.empty()) {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ set_tensor_data(t, set_tensor_data_ud);
+ }
+ return true;
+ }
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
std::vector<no_init<uint8_t>> read_buf;
#include "llama-impl.h"
#include "llama-arch.h"
+#include "llama-hparams.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <cstddef>
+#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
}
};
- static const int TENSOR_NOT_REQUIRED = 1 << 0;
- static const int TENSOR_DUPLICATED = 1 << 1;
- static const int TENSOR_SKIP = 1 << 2;
+ static const int TENSOR_NOT_REQUIRED = 1 << 0;
+ static const int TENSOR_DUPLICATED = 1 << 1;
+ static const int TENSOR_SKIP = 1 << 2;
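+ // skip creating the tensor when the model is loaded without files (metadata-only / virtual mode)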
+ static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
int n_kv = 0;
int n_tensors = 0;
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
const llama_model_tensor_buft_override * tensor_buft_overrides;
- gguf_context_ptr meta;
+ gguf_context_ptr metadata_ptr;
+ struct gguf_context * metadata; // either metadata_ptr.get() or externally set
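+ // optional callback (with user data) used to fill tensors when loading without files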
+ llama_model_set_tensor_data_t set_tensor_data;
+ void * set_tensor_data_ud;
std::vector<ggml_context_ptr> contexts;
std::string arch_name;
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+ }
+ };
+
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+ // track tensors that had to be moved for debugging:
+ size_t n_tensors_moved = 0;
+ std::string first_tensor_moved_name;
+ std::string first_tensor_moved_type_name;
+ ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+ ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
+
llama_model_loader(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
+ struct ggml_tensor * create_tensor(
+ const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
+ const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
#include "llama-model.h"
#include "llama-vocab.h"
+#include <cstdint>
#include <string>
-llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
- gguf_ctx = gguf_init_empty();
-}
+llama_model_saver::llama_model_saver(const struct llama_model * model) :
+ gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
+
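+// construct around an externally provided gguf context; a context created here is freed in the destructor, an external one is not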
+llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
+ gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
llama_model_saver::~llama_model_saver() {
- gguf_free(gguf_ctx);
+ if (gguf_ctx_owned) {
+ gguf_free(gguf_ctx);
+ }
}
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
template <typename Container>
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
- const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+ GGML_ASSERT(model != nullptr || !per_layer);
+ const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
GGML_ASSERT(n_values <= value.size());
if (n_values == 0) {
GGML_ABORT("fatal error");
}
}
+// instantiate for external usage:
+template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
std::vector<const char *> tmp(value.size());
}
void llama_model_saver::add_kv_from_model() {
- const llama_hparams & hparams = model.hparams;
- const llama_vocab & vocab = model.vocab;
+ const llama_hparams & hparams = model->hparams;
+ const llama_vocab & vocab = model->vocab;
const int32_t n_vocab = vocab.n_tokens();
std::vector<std::string> tokens(n_vocab);
std::vector<float> scores(n_vocab);
std::vector<int32_t> token_types(n_vocab);
- for (int32_t id = 0; id < n_vocab; ++id) {
- const llama_vocab::token_data & token_data = vocab.get_token_data(id);
-
- tokens[id] = token_data.text;
- scores[id] = token_data.score;
-
- switch(token_data.attr) {
- case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
- case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
- case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
- case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
- case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
- case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
- case LLAMA_TOKEN_ATTR_UNDEFINED:
- default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
+ for (int32_t id = 0; id < n_vocab; ++id) {
+ const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+ tokens[id] = token_data.text;
+ scores[id] = token_data.score;
+
+ switch(token_data.attr) {
+ case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
+ case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
+ case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
+ case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
+ case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+ case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
+ case LLAMA_TOKEN_ATTR_UNDEFINED:
+ default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ }
}
}
// add_kv(LLM_KV_GENERAL_TYPE, ???);
- add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+ add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
- add_kv(LLM_KV_GENERAL_NAME, model.name);
+ add_kv(LLM_KV_GENERAL_NAME, model->name);
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
// add_kv(LLM_KV_GENERAL_VERSION, ???);
// add_kv(LLM_KV_GENERAL_URL, ???);
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
- add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
- add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
- add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
}
void llama_model_saver::add_tensors_from_model() {
- if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
- add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+ if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
+ add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
}
- add_tensor(model.type_embd);
- add_tensor(model.pos_embd);
- add_tensor(model.tok_norm);
- add_tensor(model.tok_norm_b);
- add_tensor(model.output_norm);
- add_tensor(model.output_norm_b);
- add_tensor(model.output);
- add_tensor(model.output_b);
- add_tensor(model.output_norm_enc);
- add_tensor(model.cls);
- add_tensor(model.cls_b);
- add_tensor(model.cls_out);
- add_tensor(model.cls_out_b);
- add_tensor(model.cls_norm);
-
- for (const struct llama_layer & layer : model.layers) {
+ add_tensor(model->type_embd);
+ add_tensor(model->pos_embd);
+ add_tensor(model->tok_norm);
+ add_tensor(model->tok_norm_b);
+ add_tensor(model->output_norm);
+ add_tensor(model->output_norm_b);
+ add_tensor(model->output);
+ add_tensor(model->output_b);
+ add_tensor(model->output_norm_enc);
+ add_tensor(model->cls);
+ add_tensor(model->cls_b);
+ add_tensor(model->cls_out);
+ add_tensor(model->cls_out_b);
+ add_tensor(model->cls_norm);
+
+ for (const struct llama_layer & layer : model->layers) {
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
}
#pragma once
+#include "gguf.h"
#include "llama.h"
#include "llama-arch.h"
struct llama_model_saver {
struct gguf_context * gguf_ctx = nullptr;
- const struct llama_model & model;
+ const bool gguf_ctx_owned;
+ const struct llama_model * model;
const struct LLM_KV llm_kv;
- llama_model_saver(const struct llama_model & model);
+ llama_model_saver(const struct llama_model * model);
+ llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
~llama_model_saver();
void add_kv(enum llm_kv key, uint32_t value);
#include "llama-model.h"
+#include "ggml.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-cparams.h"
#include <algorithm>
#include <cassert>
#include <cfloat>
+#include <cstdint>
#include <cstring>
#include <cmath>
#include <functional>
case LLM_TYPE_0_3B: return "0.3B";
case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B";
+ case LLM_TYPE_0_8B: return "0.8B";
case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_2B: return "1.2B";
case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
+ case LLM_TYPE_120B_A12B: return "120B.A12B";
+ case LLM_TYPE_122B_A10B: return "122B.A10B";
case LLM_TYPE_196B_A11B: return "196B.A11B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
+ case LLM_TYPE_397B_A17B: return "397B.A17B";
case LLM_TYPE_744B_A40B: return "744B.A40B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
-// checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
- GGML_ASSERT(w != nullptr);
-
- if (op == GGML_OP_NONE) {
- return true;
- }
-
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context_ptr ctx_ptr { ggml_init(params) };
- if (!ctx_ptr) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ggml_context * ctx = ctx_ptr.get();
-
- ggml_tensor * op_tensor = nullptr;
-
- switch (op) {
- case GGML_OP_GET_ROWS:
- {
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_get_rows(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT:
- {
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
- op_tensor = ggml_mul_mat(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
- } break;
- case GGML_OP_ADD:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_add(ctx, a, w);
- } break;
- case GGML_OP_ADD_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_add_id(ctx, a, w, c);
- } break;
- case GGML_OP_MUL:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_mul(ctx, a, w);
- } break;
- case GGML_OP_DIV:
- {
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
- op_tensor = ggml_div(ctx, a, w);
- } break;
- case GGML_OP_ROPE:
- {
- int n_embd_head = hparams.n_embd_head_v;
- int n_head = hparams.n_head();
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_rope_ext(
- ctx, a, b, w,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0
- );
-
- } break;
- case GGML_OP_SSM_CONV:
- {
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 3;
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
- } break;
- case GGML_OP_SSM_SCAN:
- {
- // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
- const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
- const int64_t n_head = w->ne[1];
- const int64_t head_dim = hparams.ssm_d_inner / n_head;
- const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 3;
- ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
- ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
- ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
- ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
- ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
- } break;
- case GGML_OP_RWKV_WKV6:
- {
- // FIXME
- const int64_t S = 123;
- const int64_t H = 123;
- const int64_t n_tokens = 123;
- const int64_t n_seqs = 123;
- ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * tf = w;
- ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
- } break;
- case GGML_OP_IM2COL:
- {
- const int n_embd_inp = hparams.n_embd_inp();
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
- } break;
- case GGML_OP_SCALE:
- {
- op_tensor = ggml_scale(ctx, w, 1.0f);
- } break;
- default:
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
- }
-
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
- GGML_ASSERT(w->buffer == nullptr);
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
- ggml_backend_buffer_free(w->buffer);
- w->buffer = nullptr;
-
- return op_supported;
-}
-
-// lists of buffer types used for each layer
-using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-
-// find the first buffer type in the list that can use the tensor
-static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
- GGML_ASSERT(!buft_list.empty());
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
- return cur_buft;
- }
- }
-
- return nullptr;
-}
-
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;
}
void llama_model::load_hparams(llama_model_loader & ml) {
- const gguf_context * ctx = ml.meta.get();
+ const gguf_context * ctx = ml.metadata;
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+ hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
// sanity check for n_rot (optional)
- hparams.n_rot = hparams.n_embd_head_k;
+ hparams.n_rot_full = hparams.n_embd_head_k_full;
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
- if (hparams.n_rot != hparams.n_embd_head_k) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+ if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
}
}
} else {
- hparams.n_rot = 0;
- hparams.n_embd_head_k = 0;
- hparams.n_embd_head_v = 0;
+ hparams.n_rot_full = 0;
+ hparams.n_embd_head_k_full = 0;
+ hparams.n_embd_head_v_full = 0;
+ }
+
+ // head size and n_rot for SWA layers
+ {
+ hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
+ hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
+
+ hparams.n_rot_swa = hparams.n_rot_full;
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
}
// for differentiating model types
hparams.n_attn_temp_floor_scale = 8192;
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
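+ // the default period can be overridden by the GGUF sliding-window pattern key when present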
+ uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_AFMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
// Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 3;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period, true);
} else {
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
hparams.f_max_alibi_bias = 8.0f;
case LLM_ARCH_JINA_BERT_V3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case LLM_ARCH_NOMIC_BERT_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
case LLM_ARCH_NEO_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 28) {
type = LLM_TYPE_250M;
case LLM_ARCH_EUROBERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 12) {
type = LLM_TYPE_SMALL; // 0.2B
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_7B; break;
break;
default: type = LLM_TYPE_UNKNOWN;
}
-
- // Load attention parameters
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_PLAMO3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 8;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096; // default value of gemma 2
- hparams.set_swa_pattern(2);
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_GEMMA3:
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(6);
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_GEMMA3N:
{
+ uint32_t swa_period = 5;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(5);
+ hparams.set_swa_pattern(swa_period);
hparams.n_layer_kv_from_start = 20;
hparams.f_attention_scale = 1.0f;
case LLM_ARCH_GEMMA_EMBEDDING:
{
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
- hparams.set_swa_pattern(6);
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.causal_attn = false; // embeddings do not use causal attention
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
//applied only if model converted with --sentence-transformers-dense-modules
ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
case 24: type = LLM_TYPE_0_3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
- hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_STARCODER2:
}
switch (hparams.n_layer) {
- // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
+ // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
case 12: // 900M 8x???M
case 32: // 51B 16x?B
default: type = LLM_TYPE_UNKNOWN;
} break;
case LLM_ARCH_COMMAND_R:
{
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_35B; break;
case LLM_ARCH_COHERE2:
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
case LLM_ARCH_DEEPSEEK:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
switch (hparams.n_ff_exp) {
case 1408: type = LLM_TYPE_16B; break;
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
if (!is_lite) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
}
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// Expert gating function (GLM-4.5 uses sigmoid)
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// deepseek MLA parameters
case LLM_ARCH_JAIS:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_1_3B; break;
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);
switch (hparams.n_layer) {
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
case 56: type = LLM_TYPE_9B; break;
+ case 88: type = LLM_TYPE_120B_A12B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
if (hparams.n_layer == 64) { // 32B
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 128;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);
// Granite uses rope_finetuned as a switch for rope, so default to true
bool rope_finetuned = true;
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
- ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_7B; break;
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
} break;
case LLM_ARCH_BAILINGMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
switch (hparams.n_layer) {
case LLM_ARCH_BAILINGMOE2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
case LLM_ARCH_DOTS1:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
switch (hparams.n_layer) {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
}
switch (hparams.n_layer) {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_A13B; break;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(2);
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
{
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.n_swa = 4096;
- hparams.set_swa_pattern(4, true);
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period, true);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_GROVEMOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp, false);
ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
}
switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_2B; break;
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
+ case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
+ case 64: type = LLM_TYPE_27B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
}
switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_35B_A3B; break;
- case 48: type = LLM_TYPE_80B_A3B; break;
+ case 40: type = LLM_TYPE_35B_A3B; break;
+ case 48: type = LLM_TYPE_122B_A10B; break;
+ case 60: type = LLM_TYPE_397B_A17B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
switch (hparams.n_layer) {
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
// MoE parameters - Kimi uses moe_intermediate_size = 1024
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
switch (hparams.n_layer) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ // full_attention layers use only half of the RoPE dimensions
+ hparams.n_rot_full = hparams.n_rot_full / 2;
+
// MoE + SWA parameters
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
default: type = LLM_TYPE_UNKNOWN;
}
} break;
- default: throw std::runtime_error("unsupported model architecture");
+ default: throw std::runtime_error("unsupported model architecture: " + arch_name());
}
pimpl->n_bytes = ml.n_bytes;
// assign the output layer
pimpl->dev_output = get_layer_buft_list(n_layer);
- // one ggml context per buffer type
- int max_n_tensors = ml.n_tensors;
- max_n_tensors += 1; // duplicated output tensor
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
-
- // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
- struct ggml_backend_buft_comparator {
- bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
- return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
- }
- };
- std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
-
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
- auto it = ctx_map.find(buft);
- if (it == ctx_map.end()) {
- ggml_init_params params = {
- /*.mem_size =*/ ctx_size,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context * ctx = ggml_init(params);
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ctx_map.emplace(buft, ctx);
-
- return ctx;
- }
- return it->second.get();
- };
-
- const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
- const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
- const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+ const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
// create tensors for the weights
{
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_token_types = vocab.n_token_types();
- const int64_t n_rot = hparams.n_rot;
+ const int64_t n_rot = hparams.n_rot();
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
const int64_t n_ctx_train = hparams.n_ctx_train;
throw std::runtime_error("model has expert layers but no expert layers are used");
}
- int n_moved_tensors = 0;
- ggml_tensor * first_moved_tensor = nullptr;
- ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
- ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
-
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
- ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
- if (!t_meta) {
- if (flags & TENSOR_NOT_REQUIRED) {
- return nullptr;
- }
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
- }
-
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
- // the tensor is duplicated
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
- llm_tensor tn_tensor = tn.tensor;
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
- tn_tensor = LLM_TENSOR_OUTPUT;
- }
-
- llm_tensor_info info;
- try {
- info = llm_tensor_info_for(tn_tensor);
- } catch (const std::out_of_range & e) {
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
- }
-
- // skip unused tensors
- if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
- const size_t nbytes = ggml_nbytes(t_meta);
- LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
-
- ml.size_data -= nbytes;
- ml.n_created++;
-
- return nullptr;
- }
-
- // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
- ggml_op op;
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
- if (bias) {
- if (info.op == GGML_OP_MUL_MAT_ID) {
- op = GGML_OP_ADD_ID;
- } else {
- op = GGML_OP_ADD;
- }
- } else {
- op = info.op;
- }
-
- // sanity checks
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
- if (tn.bid != -1) {
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
- }
- } else {
- if (tn.bid == -1) {
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
- }
- }
-
- // select the buffer type for this tensor
- buft_list_t * buft_list;
- switch (info.layer) {
- case LLM_TENSOR_LAYER_INPUT:
- buft_list = pimpl->dev_input.buft_list;
- break;
- case LLM_TENSOR_LAYER_OUTPUT:
- buft_list = pimpl->dev_output.buft_list;
- break;
- case LLM_TENSOR_LAYER_REPEATING:
- buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
- break;
- default:
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
- }
-
- ggml_backend_buffer_type_t buft = nullptr;
-
- // check overrides
- if (ml.tensor_buft_overrides) {
- std::string tensor_name = tn.str();
- for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
- std::regex pattern(overrides->pattern);
- if (std::regex_search(tensor_name, pattern)) {
- if (overrides->buft == ggml_backend_cpu_buffer_type()) {
- // when overriding to a CPU buffer, consider the extra buffer types
- buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
- } else {
- buft = overrides->buft;
- }
-
- LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
- tensor_name.c_str(),
- ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
- ggml_backend_buft_name(buft));
- break;
- }
- }
- }
-
- if (!buft) {
- buft = select_weight_buft(hparams, t_meta, op, *buft_list);
- if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
- }
- }
-
- // avoid using a host buffer when using mmap
- auto * buft_dev = ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (!cpu_dev) {
- throw std::runtime_error("no CPU backend found");
- }
- buft = ggml_backend_dev_buffer_type(cpu_dev);
- }
-
- if (buft != buft_list->front().second) {
- n_moved_tensors++;
- if (!first_moved_tensor) {
- first_moved_tensor = t_meta;
- first_moved_from_buft = buft_list->front().second;
- first_moved_to_buft = buft;
- }
- }
-
- ggml_context * ctx = ctx_for_buft(buft);
-
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
- if (flags & TENSOR_DUPLICATED) {
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
- if (t) {
- return t;
- }
- }
- return ml.create_tensor(ctx, tn, ne, flags);
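+ // buffer-type selection and context management now live in the loader; just forward the relevant buft lists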
+ const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
+ return ml.create_tensor(
+ hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
+ tn, ne, flags);
};
layers.resize(n_layer);
} break;
case LLM_ARCH_LLAMA4:
{
+ if (n_expert == 0) {
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
+ }
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
}
for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
+ const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
auto & layer = layers[i];
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
if (is_moe_layer) {
- int n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
} break;
case LLM_ARCH_MINICPM3:
{
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
case LLM_ARCH_GROK:
{
if (n_expert == 0) {
- throw std::runtime_error("Grok model cannot have zero experts");
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
}
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_JINA_BERT_V3:
{
+ if (n_token_types == 0) {
+ throw std::runtime_error(arch_name() + " model needs to define token type count");
+ }
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ // FIXME test-llama-archs crashes if q_norm is created
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
// attention parameters
- const uint32_t qk_dim = hparams.n_embd_head_k;
- const uint32_t v_dim = hparams.n_embd_head_v;
+ const uint32_t qk_dim = hparams.n_embd_head_k();
+ const uint32_t v_dim = hparams.n_embd_head_v();
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
} break;
case LLM_ARCH_PLAMO3:
{
- const int64_t head_dim_q = hparams.n_embd_head_k;
- const int64_t head_dim_v = hparams.n_embd_head_v;
+ const int64_t head_dim_q = hparams.n_embd_head_k();
+ const int64_t head_dim_v = hparams.n_embd_head_v();
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
} break;
case LLM_ARCH_SEED_OSS:
{
- const uint32_t head_dim = hparams.n_embd_head_k;
+ const uint32_t head_dim = hparams.n_embd_head_k();
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+ GGML_ASSERT(n_embd_head_qk_nope >= 1);
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
} break;
case LLM_ARCH_PLM:
{
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const int64_t kv_lora_rank = hparams.n_lora_kv;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
} break;
case LLM_ARCH_T5:
layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
// this tensor seems to be unused in HF transformers implementation
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+ layer.attn_rel_b_cross = create_tensor(
+ tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t n_ssm_head = hparams.ssm_dt_rank;
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+ const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
// MoE branch
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_latent_up = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP, "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, moe_n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
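// Editor's note: an illustrative sketch (not part of this change) of what the latent
// projection implies for the graph; moe_ffn() below is a stand-in for the existing
// expert evaluation. When moe_latent_size > 0 the experts run in a reduced latent space:
//
//   ggml_tensor * latent = ggml_mul_mat(ctx, layer.ffn_latent_down, cur); // {n_embd} -> {moe_n_embd}
//   latent = moe_ffn(latent, layer.ffn_up_exps, layer.ffn_down_exps /*, ... */); // experts in latent space
//   cur = ggml_mul_mat(ctx, layer.ffn_latent_up, latent); // {moe_n_embd} -> {n_embd}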
// Shared expert branch
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
const int64_t n_ff_exp = hparams.n_ff_exp;
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
- const int64_t head_dim = hparams.n_embd_head_k;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
+ const int64_t head_dim = hparams.n_embd_head_k();
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
}
} break;
case LLM_ARCH_HUNYUAN_DENSE:
const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
const int64_t ssm_d_conv = hparams.ssm_d_conv;
- // Try loading KDA specific tensors (using SSM_ prefix)
- // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
- // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
- layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
- if (!layer.ssm_q_conv) {
- layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
- }
+ if (hparams.is_recurrent(i)) {
+ // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
+ // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_q_conv) {
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+ }
- if (layer.ssm_q_conv) {
// KDA Layer - Conv1d weights may be 3D or 4D
layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
if (!layer.ssm_k_conv) {
// Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
// Note: hparams.n_rot may be 72 (from conversion) but actual is 64
- const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
+ const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
// Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
+ {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
if (!layer.wkv_b) { // MLA KV cache enabled
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
} break;
case LLM_ARCH_QWEN3NEXT:
{
+ if (n_expert == 0) {
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
+ }
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
// Shared experts
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
}
} break;
case LLM_ARCH_QWEN35MOE:
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
uint32_t n_rot_max = 0;
for (int i = 0; i < n_layer; ++i) {
- n_rot_max = std::max(n_rot_max, hparams.n_rot);
+ n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
}
if (n_rot_max == 0) {
n_rot_max = n_rot;
throw std::runtime_error("unknown architecture");
}
- if (n_moved_tensors > 0) {
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
+ // this avoids having to add scale loading to every architecture
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // attention weight scales (per-tensor, shape {1})
+ if (!layer.wq_s && layer.wq) {
+ layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wk_s && layer.wk) {
+ layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wv_s && layer.wv) {
+ layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wo_s && layer.wo) {
+ layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wqkv_s && layer.wqkv) {
+ layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wqkv_gate_s && layer.wqkv_gate) {
+ layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+
+ // dense FFN weight scales (per-tensor, shape {1})
+ if (!layer.ffn_gate_s && layer.ffn_gate) {
+ layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_down_s && layer.ffn_down) {
+ layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_up_s && layer.ffn_up) {
+ layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
+ layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
+ layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
+ layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+
+ // MoE expert weight scales (per-expert, shape {n_expert})
+ if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
+ layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
+ layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
+ layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+
+ // recurrent / linear-attention weight scales (per-tensor, shape {1})
+ if (!layer.ssm_out_s && layer.ssm_out) {
+ layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ssm_alpha_s && layer.ssm_alpha) {
+ layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ssm_beta_s && layer.ssm_beta) {
+ layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
}
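// Editor's note: an illustrative sketch (not part of this change) of how a loaded
// per-tensor ".scale" is typically consumed, assuming the usual ggml broadcast rules:
//
//   ggml_tensor * cur = ggml_mul_mat(ctx, layer.wq, inp); // quantized matmul
//   if (layer.wq_s) {
//       cur = ggml_mul(ctx, cur, layer.wq_s); // apply the stored scale (shape {1})
//   }
//
// The graph code that applies these scales is outside the hunks shown here.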
}
// create the backend buffers
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
- ctx_buf_maps.reserve(ctx_map.size());
+ ctx_buf_maps.reserve(ml.ctx_map.size());
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+ const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
- for (auto & [buft, ctx_ptr] : ctx_map) {
+ for (auto & [buft, ctx_ptr] : ml.ctx_map) {
ggml_context * ctx = ctx_ptr.get();
// skip contexts without tensors
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa);
+ LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa);
}
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
+ LLM_TYPE_0_8B,
LLM_TYPE_1B,
LLM_TYPE_1_2B,
LLM_TYPE_1_3B,
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
+ LLM_TYPE_120B_A12B, // Nemotron 3 Super
+ LLM_TYPE_122B_A10B, // Qwen3.5
LLM_TYPE_196B_A11B, // Step3.5-Flash
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
+ LLM_TYPE_397B_A17B, // Qwen3.5
LLM_TYPE_744B_A40B, // GLM-5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
+ // ff MoE per-expert weight scales (NVFP4 scale2, one value per expert)
+ struct ggml_tensor * ffn_gate_exps_s = nullptr;
+ struct ggml_tensor * ffn_down_exps_s = nullptr;
+ struct ggml_tensor * ffn_up_exps_s = nullptr;
+
+ // ff MoE latent proj
+ struct ggml_tensor * ffn_latent_down = nullptr;
+ struct ggml_tensor * ffn_latent_up = nullptr;
+
// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
struct ggml_tensor * ffn_gate_shexp = nullptr;
struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
- struct ggml_tensor * wq_scale = nullptr;
- struct ggml_tensor * wk_scale = nullptr;
- struct ggml_tensor * wv_scale = nullptr;
- struct ggml_tensor * wo_scale = nullptr;
- struct ggml_tensor * ffn_gate_scale = nullptr;
- struct ggml_tensor * ffn_up_scale = nullptr;
- struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * wq_s = nullptr;
+ struct ggml_tensor * wk_s = nullptr;
+ struct ggml_tensor * wv_s = nullptr;
+ struct ggml_tensor * wo_s = nullptr;
+ struct ggml_tensor * wqkv_s = nullptr;
+ struct ggml_tensor * wqkv_gate_s = nullptr;
+ struct ggml_tensor * ffn_gate_s = nullptr;
+ struct ggml_tensor * ffn_up_s = nullptr;
+ struct ggml_tensor * ffn_down_s = nullptr;
+ struct ggml_tensor * ffn_gate_shexp_s = nullptr;
+ struct ggml_tensor * ffn_up_shexp_s = nullptr;
+ struct ggml_tensor * ffn_down_shexp_s = nullptr;
+ struct ggml_tensor * ssm_out_s = nullptr;
+ struct ggml_tensor * ssm_alpha_s = nullptr;
+ struct ggml_tensor * ssm_beta_s = nullptr;
// altup & laurel
struct ggml_tensor * per_layer_inp_gate = nullptr;
-#include "llama-quant.h"
+#include "llama.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
-#include <algorithm>
#include <cmath>
#include <cstring>
+#include <string>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>
-// Quantization types. Changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
+// result of parsing --tensor-type option
+// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
+struct tensor_type_option {
std::string name;
- ggml_type quant = GGML_TYPE_COUNT;
+ ggml_type type = GGML_TYPE_COUNT;
+};
+
+// tensor categorization - used to avoid repeated string matching in quantization logic.
+// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
+enum class tensor_category {
+ TOKEN_EMBD,
+ ATTENTION_Q,
+ ATTENTION_V,
+ ATTENTION_K,
+ ATTENTION_QKV,
+ ATTENTION_KV_B,
+ ATTENTION_OUTPUT,
+ FFN_UP,
+ FFN_GATE,
+ FFN_DOWN,
+ OUTPUT,
+ OTHER
};
static void zeros(std::ofstream & file, size_t n) {
return orig_name;
}
-static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
+static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
if (mapped.empty()) {
return orig_name;
}
return orig_name;
}
+//
+// helper functions for tensor name matching
+//
+
+static bool tensor_name_match_token_embd(const char * tensor_name) {
+ return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
+ std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
+}
+
+static bool tensor_name_match_output_weight(const char * tensor_name) {
+ return std::strcmp(tensor_name, "output.weight") == 0;
+}
+
+//
+// tensor categorization for quantization
+//
+// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch)
+//
+
+static tensor_category tensor_get_category(const std::string & tensor_name) {
+ if (tensor_name_match_output_weight(tensor_name.c_str())) {
+ return tensor_category::OUTPUT;
+ }
+ if (tensor_name_match_token_embd(tensor_name.c_str())) {
+ return tensor_category::TOKEN_EMBD;
+ }
+ if (tensor_name.find("attn_qkv.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_QKV;
+ }
+ if (tensor_name.find("attn_kv_b.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_KV_B;
+ }
+ if (tensor_name.find("attn_v.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_V;
+ }
+ if (tensor_name.find("attn_k.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_K;
+ }
+ if (tensor_name.find("attn_q.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_Q;
+ }
+ if (tensor_name.find("attn_output.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_OUTPUT;
+ }
+ if (tensor_name.find("ffn_up") != std::string::npos) {
+ return tensor_category::FFN_UP;
+ }
+ if (tensor_name.find("ffn_gate") != std::string::npos) {
+ return tensor_category::FFN_GATE;
+ }
+ if (tensor_name.find("ffn_down") != std::string::npos) {
+ return tensor_category::FFN_DOWN;
+ }
+ return tensor_category::OTHER;
+}
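// Editor's illustration of the mapping above (check order matters, e.g. attn_qkv
// is tested before attn_q/attn_k/attn_v):
//
//   tensor_get_category("output.weight")              == tensor_category::OUTPUT
//   tensor_get_category("blk.3.attn_qkv.weight")      == tensor_category::ATTENTION_QKV
//   tensor_get_category("blk.7.ffn_down_exps.weight") == tensor_category::FFN_DOWN  // substring match covers expert variants
//   tensor_get_category("blk.0.attn_norm.weight")     == tensor_category::OTHER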
+
+// check if category is for attention-v-like tensors (more sensitive to quantization)
+static bool category_is_attn_v(tensor_category cat) {
+ return cat == tensor_category::ATTENTION_V ||
+ cat == tensor_category::ATTENTION_QKV ||
+ cat == tensor_category::ATTENTION_KV_B;
+}
+
+//
+// quantization state
+//
+
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
int i_ffn_gate = 0;
int i_ffn_up = 0;
- int n_k_quantized = 0;
int n_fallback = 0;
bool has_imatrix = false;
- // used to figure out if a model shares tok_embd with the output weight
- bool has_output = false;
+ // used to figure out if a model has tied embeddings (tok_embd shares weights with output)
+ bool has_tied_embeddings = true; // assume tied until we see output.weight
+
+ // tensor type override patterns (compiled once, reused for every tensor)
+ std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
+
+ quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
+ model(model), params(params)
+ {
+ // compile regex patterns once - they are expensive
+ if (params->tensor_types) {
+ const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
+ for (const auto & [tname, qtype] : tensor_types) {
+ tensor_type_patterns.emplace_back(std::regex(tname), qtype);
+ }
+ }
+ }
+};
- quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
- : model(model)
- , params(params)
- {}
+// per-tensor metadata, computed in the preliminary loop and used in the main loop
+struct tensor_metadata {
+ ggml_type target_type;
+ tensor_category category;
+ std::string remapped_imatrix_name;
+ bool allows_quantization;
+ bool requires_imatrix;
};
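// Editor's sketch (not part of this change) of the two-pass flow this struct enables;
// the real loops appear further down in llama_model_quantize_impl():
//
//   std::vector<tensor_metadata> metadata(tensors.size());
//   // pass 1: categorize each tensor, pick its target type, note imatrix requirements
//   // pass 2: quantize using the cached metadata[i] - no repeated string matching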
+//
+// dequantization
+//
+
static void llama_tensor_dequantize_impl(
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
workers.clear();
}
-static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+//
+// do we allow this tensor to be quantized?
+//
+
+static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
+ // trivial checks first -- no string ops needed
+ if (params->only_copy) return false;
+
+ // quantize only 2D and 3D tensors (experts)
+ if (ggml_n_dims(tensor) < 2) return false;
+
+ const std::string name = ggml_get_name(tensor);
+
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
+ bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+
+ // do not quantize norm tensors
+ quantize &= name.find("_norm.weight") == std::string::npos;
+
+ quantize &= params->quantize_output_tensor || name != "output.weight";
+
+ // do not quantize expert gating tensors
+ // NOTE: can't use LLM_TN here because the layer number is not known
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
+ // these are very small (e.g. 4x4)
+ quantize &= name.find("altup") == std::string::npos;
+ quantize &= name.find("laurel") == std::string::npos;
+
+ // these are not too big, so keep them as they are
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
+ quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
+
+ // do not quantize Mamba/Kimi's small conv1d weights
+ // NOTE: can't use LLM_TN here because the layer number is not known
+ quantize &= name.find("ssm_conv1d") == std::string::npos;
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;
+
+ // do not quantize RWKV's small yet 2D weights
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_g2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+
+ // do not quantize relative position bias (T5)
+ quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
+ // do not quantize specific multimodal tensors
+ quantize &= name.find(".position_embd.") == std::string::npos;
+
+ return quantize;
+}
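// Editor's illustration of the filter above (assuming 2D weights and default params;
// the t_* tensor handles are hypothetical placeholders):
//
//   tensor_allows_quantization(params, arch, t_attn_q)    // "blk.0.attn_q.weight"       -> true
//   tensor_allows_quantization(params, arch, t_attn_norm) // "blk.0.attn_norm.weight"    -> false (norm weights stay in full precision)
//   tensor_allows_quantization(params, arch, t_gate_inp)  // "blk.0.ffn_gate_inp.weight" -> false (expert router weights are excluded)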
+
+//
+// tensor type selection
+//
+
+// incompatible tensor shapes are handled here - fall back to a compatible type
+static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
+ ggml_type return_type = target_type;
+
+ const int64_t ncols = t->ne[0];
+ const int64_t qk_k = ggml_blck_size(target_type);
+
+ if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant
+ LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ",
+ t->name, ncols, qk_k, ggml_type_name(target_type));
+ ++qs.n_fallback;
+
+ switch (target_type) {
+ // types on the left: block size 256
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S: // types on the right: block size 32
+ case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break;
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_TQ1_0:
+ case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break;
+ case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break;
+ case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break;
+ case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break;
+ default:
+ throw std::runtime_error(format("no tensor type fallback is defined for type %s",
+ ggml_type_name(target_type)));
+ }
+ if (ncols % ggml_blck_size(return_type) != 0) {
+ //
+ // the fallback return type is still not compatible with this tensor!
+ //
+ // most likely, this tensor's first dimension is not divisible by 32.
+ // this is very rare. we can either abort the quantization, or
+ // fall back to F16 / F32.
+ //
+ LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) ");
+ return_type = GGML_TYPE_F16;
+ }
+ LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type));
+ }
+ return return_type;
+}
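// Worked example (editor's note): a tensor with ne[0] == 2880 cannot use Q4_K
// (block size 256, since 2880 % 256 == 64), so tensor_type_fallback() returns Q5_0
// (block size 32, 2880 % 32 == 0). Only if even the 32-wide fallback does not divide
// ne[0] is the tensor stored as F16.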
+
+// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
+static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
const llm_arch arch = qs.model.arch;
- const auto tn = LLM_TN(arch);
auto use_more_bits = [](int i_layer, int n_layers) -> bool {
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
- if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+ if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) {
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
} else {
new_type = GGML_TYPE_Q8_0;
}
- } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
+ } else if (category == tensor_category::TOKEN_EMBD) {
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
new_type = qs.params->token_embedding_type;
} else {
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
- if (name.find("attn_v.weight") != std::string::npos) {
+ if (category_is_attn_v(category)) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
++qs.i_attention_wv;
}
- else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+ else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) {
new_type = GGML_TYPE_Q4_K;
}
- else if (name.find("ffn_down") != std::string::npos) {
+ else if (category == tensor_category::FFN_DOWN) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_down;
}
- else if (name.find("attn_output.weight") != std::string::npos) {
+ else if (category == tensor_category::ATTENTION_OUTPUT) {
if (qs.model.hparams.n_expert == 8) {
new_type = GGML_TYPE_Q5_K;
} else {
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
}
}
- } else if (name.find("attn_v.weight") != std::string::npos) {
+ } else if (category_is_attn_v(category)) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
new_type = GGML_TYPE_Q8_0;
}
++qs.i_attention_wv;
- } else if (name.find("attn_k.weight") != std::string::npos) {
+ } else if (category == tensor_category::ATTENTION_K) {
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
- } else if (name.find("attn_q.weight") != std::string::npos) {
+ } else if (category == tensor_category::ATTENTION_Q) {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
- } else if (name.find("ffn_down") != std::string::npos) {
+ } else if (category == tensor_category::FFN_DOWN) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_ffn_down;
- } else if (name.find("attn_output.weight") != std::string::npos) {
+ } else if (category == tensor_category::ATTENTION_OUTPUT) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
}
}
- else if (name.find("attn_qkv.weight") != std::string::npos) {
+ else if (category == tensor_category::ATTENTION_QKV) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
- else if (name.find("ffn_gate") != std::string::npos) {
+ else if (category == tensor_category::FFN_GATE) {
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
}
++qs.i_ffn_gate;
}
- else if (name.find("ffn_up") != std::string::npos) {
+ else if (category == tensor_category::FFN_UP) {
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
return new_type;
}
+// outer wrapper: determine the ggml_type that this tensor should be quantized to
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
+ if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
+ return tensor->type;
+ }
+ if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) {
+ return params->token_embedding_type;
+ }
+ if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) {
+ return params->output_tensor_type;
+ }
+
+ ggml_type new_type = default_type;
+
+ // get more optimal quantization type based on the tensor shape, layer, etc.
+ if (!params->pure && ggml_is_quantized(default_type)) {
+ // if the user provided tensor types - use those
+ bool manual = false;
+ if (!qs.tensor_type_patterns.empty()) {
+ const std::string tensor_name(tensor->name);
+ for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
+ if (std::regex_search(tensor_name, pattern)) {
+ if (qtype != new_type) {
+ LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
+ __func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
+ new_type = qtype;
+ manual = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+ if (!manual) {
+ new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category);
+ }
+
+ // incompatible tensor shapes are handled here - fall back to a compatible type
+ new_type = tensor_type_fallback(qs, tensor, new_type);
+ }
+
+ return new_type;
+}
+
+//
+// quantization implementation
+//
+
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
return new_size;
}
-static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) {
- return (
- dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
- dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S ||
- dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M ||
- ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings
- dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0
- )
- );
+//
+// imatrix requirement check
+//
+
+static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) {
+ if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) {
+ return false;
+ }
+ switch (dst_type) {
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_IQ1_S:
+ return true;
+ case GGML_TYPE_Q2_K:
+ // as a general rule, the k-type quantizations don't require imatrix data.
+ // the only exception is Q2_K tensors that are part of a Q2_K_S file.
+ return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S;
+ default:
+ return false;
+ }
}
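// Editor's illustration of the rule above:
//
//   tensor_requires_imatrix("blk.0.ffn_down.weight", GGML_TYPE_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS) == true
//   tensor_requires_imatrix("blk.0.ffn_down.weight", GGML_TYPE_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K) == false
//   tensor_requires_imatrix("blk.0.ffn_down.weight", GGML_TYPE_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K_S) == true
//   tensor_requires_imatrix("token_embd.weight", GGML_TYPE_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS) == false  // embeddings are exempt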
-static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
- ggml_type default_type;
- llama_ftype ftype = params->ftype;
+//
+// given a file type, get the default tensor type
+//
- switch (params->ftype) {
- case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
- case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
- case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
- case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
- case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
- case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
- case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
- case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
+static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
+ switch (ftype) {
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
+ case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
+ case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
+ case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
- case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
- case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
- case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
- case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
- case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
- case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K;
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0;
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS;
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS;
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S;
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M;
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL;
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
+ case LLAMA_FTYPE_MOSTLY_IQ3_S:
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
+}
+
+//
+// main quantization driver
+//
+
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+ ggml_type default_type;
+ llama_ftype ftype = params->ftype;
int nthread = params->nthread;
nthread = std::thread::hardware_concurrency();
}
+ default_type = llama_ftype_get_default_type(ftype);
+
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
+ fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
if (params->imatrix) {
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
- LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+ LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
+ __func__, (int)imatrix_data->size());
qs.has_imatrix = true;
// check imatrix for nans or infs
for (const auto & kv : *imatrix_data) {
}
// copy the KV pairs from the input file
- gguf_set_kv (ctx_out.get(), ml.meta.get());
+ gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
});
}
- for (const auto * it : tensors) {
- const struct ggml_tensor * tensor = it->tensor;
-
- const std::string name = ggml_get_name(tensor);
-
- // TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name.find("attn_v.weight") != std::string::npos ||
- name.find("attn_qkv.weight") != std::string::npos ||
- name.find("attn_kv_b.weight")!= std::string::npos) {
- ++qs.n_attention_wv;
- } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
- qs.has_output = true;
- }
- }
-
- qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<std::thread> workers;
- workers.reserve(nthread);
-
int idx = 0;
-
- std::vector<no_init<uint8_t>> read_data;
- std::vector<no_init<uint8_t>> work;
- std::vector<no_init<float>> f32_conv_buf;
-
uint16_t n_split = 1;
// Assume split index is continuous
std::vector<gguf_context_ptr> ctx_outs(n_split);
ctx_outs[0] = std::move(ctx_out);
- // populate the original tensors so we get an initial meta data
- for (const auto * it : tensors) {
+ // compute tensor metadata once and cache it
+ std::vector<tensor_metadata> metadata(tensors.size());
+
+ // initialize quantization state before the preliminary loop (counters used by use_more_bits)
+ {
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ const auto cat = tensor_get_category(tensors[i]->tensor->name);
+ if (category_is_attn_v(cat)) {
+ ++qs.n_attention_wv;
+ }
+ if (cat == tensor_category::OUTPUT) {
+ qs.has_tied_embeddings = false;
+ }
+ metadata[i].category = cat; // save and re-use the category while we're at it
+ }
+ // these also need to be set to n_layer by default
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
+ }
+
+ // flag for --dry-run: report whether a real quantization would require an imatrix
+ bool will_require_imatrix = false;
+
+ //
+ // preliminary iteration over all weights
+ //
+
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ const auto * it = tensors[i];
+ const struct ggml_tensor * tensor = it->tensor;
+ const std::string name = ggml_get_name(tensor);
+
uint16_t i_split = params->keep_split ? it->idx : 0;
- ggml_tensor * tensor = it->tensor;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());
}
gguf_add_tensor(ctx_outs[i_split].get(), tensor);
+
+ metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
+
+ if (metadata[i].allows_quantization) {
+ metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
+ } else {
+ metadata[i].target_type = tensor->type;
+ }
+
+ metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype);
+
+ if (params->imatrix) {
+ metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped);
+ } else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) {
+ if (params->dry_run) {
+ will_require_imatrix = true;
+ } else {
+ LLAMA_LOG_ERROR("\n============================================================================\n"
+ " ERROR: this quantization requires an importance matrix!\n"
+ " - offending tensor: %s\n"
+ " - target type: %s\n"
+ "============================================================================\n\n",
+ name.c_str(), ggml_type_name(metadata[i].target_type));
+ throw std::runtime_error("this quantization requires an imatrix!");
+ }
+ }
}
// Set split info if needed
}
}
+ size_t total_size_org = 0;
+ size_t total_size_new = 0;
+
+ std::vector<std::thread> workers;
+ workers.reserve(nthread);
+
+ std::vector<no_init<uint8_t>> read_data;
+ std::vector<no_init<uint8_t>> work;
+ std::vector<no_init<float>> f32_conv_buf;
+
int cur_split = -1;
std::ofstream fout;
auto close_ofstream = [&]() {
::zeros(fout, meta_size);
};
- const auto tn = LLM_TN(model.arch);
-
// no output file for --dry-run
if (!params->dry_run) {
new_ofstream(0);
}
- // flag for `--dry-run`, to let the user know if imatrix will be required for a real
- // quantization, as a courtesy
- bool will_require_imatrix = false;
+ //
+ // main loop: iterate over all weights
+ //
- for (const auto * it : tensors) {
- const auto & weight = *it;
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ const auto & weight = *tensors[i];
+ const auto & tm = metadata[i];
ggml_tensor * tensor = weight.tensor;
+
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
ml.load_data_for(tensor);
}
- LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
+ LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
- // This used to be a regex, but <regex> has an extreme cost to compile times.
- bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
-
- // quantize only 2D and 3D tensors (experts)
- quantize &= (ggml_n_dims(tensor) >= 2);
-
- // do not quantize norm tensors
- quantize &= name.find("_norm.weight") == std::string::npos;
-
- quantize &= params->quantize_output_tensor || name != "output.weight";
- quantize &= !params->only_copy;
-
- // do not quantize expert gating tensors
- // NOTE: can't use LLM_TN here because the layer number is not known
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
-
- // these are very small (e.g. 4x4)
- quantize &= name.find("altup") == std::string::npos;
- quantize &= name.find("laurel") == std::string::npos;
-
- // these are not too big so keep them as it is
- quantize &= name.find("per_layer_model_proj") == std::string::npos;
-
- // do not quantize positional embeddings and token types (BERT)
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
-
- // do not quantize Mamba /Kimi's small conv1d weights
- // NOTE: can't use LLM_TN here because the layer number is not known
- quantize &= name.find("ssm_conv1d") == std::string::npos;
- quantize &= name.find("shortconv.conv.weight") == std::string::npos;
-
- // do not quantize RWKV's small yet 2D weights
- quantize &= name.find("time_mix_first.weight") == std::string::npos;
- quantize &= name.find("time_mix_w0.weight") == std::string::npos;
- quantize &= name.find("time_mix_w1.weight") == std::string::npos;
- quantize &= name.find("time_mix_w2.weight") == std::string::npos;
- quantize &= name.find("time_mix_v0.weight") == std::string::npos;
- quantize &= name.find("time_mix_v1.weight") == std::string::npos;
- quantize &= name.find("time_mix_v2.weight") == std::string::npos;
- quantize &= name.find("time_mix_a0.weight") == std::string::npos;
- quantize &= name.find("time_mix_a1.weight") == std::string::npos;
- quantize &= name.find("time_mix_a2.weight") == std::string::npos;
- quantize &= name.find("time_mix_g1.weight") == std::string::npos;
- quantize &= name.find("time_mix_g2.weight") == std::string::npos;
- quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
- quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
- quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
-
- // do not quantize relative position bias (T5)
- quantize &= name.find("attn_rel_b.weight") == std::string::npos;
-
- // do not quantize specific multimodal tensors
- quantize &= name.find(".position_embd.") == std::string::npos;
-
- ggml_type new_type;
- void * new_data;
- size_t new_size;
-
- if (quantize) {
- new_type = default_type;
-
- // get more optimal quantization type based on the tensor shape, layer, etc.
- if (!params->pure && ggml_is_quantized(default_type)) {
- // if the user provided tensor types - use those
- bool manual = false;
- if (params->tensor_types) {
- const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
- const std::string tensor_name(tensor->name);
- for (const auto & [tname, qtype] : tensor_types) {
- if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
- if (qtype != new_type) {
- LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
- new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
- manual = true;
- break;
- }
- }
- }
- }
-
- // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
- if (!manual) {
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
- }
-
- // incompatible tensor shapes are handled here - fallback to a compatible type
- {
- bool convert_incompatible_tensor = false;
+ const ggml_type cur_type = tensor->type;
+ const ggml_type new_type = tm.target_type;
- const int64_t nx = tensor->ne[0];
- const int64_t ny = tensor->ne[1];
- const int64_t qk_k = ggml_blck_size(new_type);
-
- if (nx % qk_k != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
- convert_incompatible_tensor = true;
- } else {
- ++qs.n_k_quantized;
- }
-
- if (convert_incompatible_tensor) {
- switch (new_type) {
- case GGML_TYPE_TQ1_0:
- case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ3_S:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
- default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
- }
- if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
- new_type = GGML_TYPE_F16;
- }
- LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
- ++qs.n_fallback;
- }
- }
- }
- if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
- new_type = params->token_embedding_type;
- }
- if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
- new_type = params->output_tensor_type;
- }
+ // If we've decided to quantize to the same type the tensor is already
+ // in then there's nothing to do.
+ bool quantize = cur_type != new_type;
- // If we've decided to quantize to the same type the tensor is already
- // in then there's nothing to do.
- quantize = tensor->type != new_type;
- }
+ void * new_data;
+ size_t new_size;
- // we have now decided on the target type for this tensor
if (params->dry_run) {
- // the --dry-run option calculates the final quantization size without quantizting
+ // the --dry-run option calculates the final quantization size without quantizing
if (quantize) {
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
tensor_size/1024.0/1024.0,
new_size/1024.0/1024.0,
ggml_type_name(new_type));
- if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ if (!will_require_imatrix && tm.requires_imatrix) {
will_require_imatrix = true;
}
} else {
} else {
// no --dry-run, perform quantization
if (!quantize) {
- new_type = tensor->type;
new_data = tensor->data;
new_size = tensor_size;
LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
const float * imatrix = nullptr;
if (imatrix_data) {
- auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+ auto it = imatrix_data->find(tm.remapped_imatrix_name);
if (it == imatrix_data->end()) {
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
// this is a significant error and it may be a good idea to abort the process if this happens,
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
// tok_embd should be ignored in this case, since it always causes this warning
- if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ if (!tensor_name_match_token_embd(tensor->name)) {
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
}
}
}
}
- if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ if (!imatrix && tm.requires_imatrix) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
-
- // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 0
- if (new_type == GGML_TYPE_MXFP4) {
- auto * x = f32_data_03;
-
- //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
- std::vector<float> deq(nrows*n_per_row);
- const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
- qtype->to_float(new_data_03, deq.data(), deq.size());
-
- double err = 0.0f;
- for (int i = 0; i < (int) deq.size(); ++i) {
- err += fabsf(deq[i] - x[i]);
- //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
- if (deq[i] != x[i]) {
- LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
- }
- }
- //LLAMA_LOG_INFO("err = %f\n", err);
- GGML_ASSERT(err == 0.00000);
- }
-#endif
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
}
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
} // no --dry-run
- } // iterate over tensors
+ } // main loop
if (!params->dry_run) {
close_ofstream();
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
- __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+ __func__, qs.n_fallback, ml.n_tensors);
}
}
};
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
- struct gguf_context * ctx = ml.meta.get();
+ struct gguf_context * ctx = ml.metadata;
// determine vocab type
{
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- // correct endiannes of data in precompiled_charsmap binary blob
+ // correct endianness of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
#include "llama.h"
+#include "ggml-cpp.h"
#include "llama-impl.h"
#include "llama-chat.h"
#include "ggml.h"
#include "ggml-backend.h"
+#include "gguf.h"
#include <algorithm>
#include <cassert>
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
+ const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;
model.t_start_us = tm.t_start_us;
try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+ llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
+ params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();
}
static struct llama_model * llama_model_load_from_file_impl(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & path_model,
std::vector<std::string> & splits,
struct llama_model_params params) {
+ GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one of metadata and path_model must be provided");
ggml_time_init();
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
props.memory_free/1024/1024);
}
- const int status = llama_model_load(path_model, splits, *model, params);
+ const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
return model;
}
+struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
+ struct llama_model_params params) {
+ GGML_ASSERT(metadata != nullptr);
+ std::string path_model;
+ std::vector<std::string> splits = {};
+ params.use_mmap = false;
+ params.use_extra_bufts = false;
+ return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
+}
// deprecated
struct llama_model * llama_load_model_from_file(
const char * path_model,
const char * path_model,
struct llama_model_params params) {
std::vector<std::string> splits = {};
- return llama_model_load_from_file_impl(path_model, splits, params);
+ return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
}
struct llama_model * llama_model_load_from_splits(
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
- return llama_model_load_from_file_impl(splits.front(), splits, params);
+ return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
}
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
- llama_model_saver ms(*model);
+ llama_model_saver ms(model);
ms.add_kv_from_model();
ms.add_tensors_from_model();
ms.save(path_model);
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include "ggml-opt.h"
+#include "gguf.h"
#include <stddef.h>
#include <stdint.h>
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+ typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
+
+ // Create a new model from GGUF metadata as well as a function to set the tensor data
+ // - tensors are created as GGML_TYPE_F32 by default,
+ // override by adding a tensor with the same name but a different type to the context
+ LLAMA_API struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
+ void * set_tensor_data_ud, // userdata for function
+ struct llama_model_params params);
+
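For review purposes, a minimal usage sketch of the new entry point, assuming the GGUF metadata is read with gguf_init_from_file in no_alloc mode and that the callback may upload data with ggml_backend_tensor_set; the zero-filling callback and the "model.gguf" path are illustrative placeholders, not part of this patch.

#include <cstdint>
#include <vector>
#include "llama.h"
#include "gguf.h"
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical callback: fills every tensor with zeros. A real caller would look the
// tensor up by tensor->name and upload its own weights instead.
static void set_zeros(struct ggml_tensor * tensor, void * /*userdata*/) {
    std::vector<uint8_t> zeros(ggml_nbytes(tensor), 0);
    ggml_backend_tensor_set(tensor, zeros.data(), 0, zeros.size());
}

int main() {
    struct gguf_init_params gparams = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * meta = gguf_init_from_file("model.gguf", gparams); // placeholder path
    if (meta == nullptr) {
        return 1;
    }

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_init_from_user(meta, set_zeros, /*set_tensor_data_ud =*/ nullptr, mparams);

    const bool ok = model != nullptr;
    llama_model_free(model); // no-op when model is nullptr
    gguf_free(meta);
    return ok ? 0 : 1;
}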
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");
- // Load the model from a file
+ // Load a model from a file
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
// If the split file name does not follow this pattern, use llama_model_load_from_splits
LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);
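As a companion to the split naming rule above, a short sketch of loading a split model through llama_model_load_from_splits; the base name "model", the split count, and the (paths, n_paths, params) parameter order are assumptions for illustration rather than something this diff shows in full.

#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

int main() {
    // Build the canonical split names: model-00001-of-00003.gguf, model-00002-of-00003.gguf, ...
    const int n_split = 3;
    std::vector<std::string> names;
    for (int i = 1; i <= n_split; ++i) {
        char buf[256];
        snprintf(buf, sizeof(buf), "model-%05d-of-%05d.gguf", i, n_split);
        names.emplace_back(buf);
    }

    std::vector<const char *> paths;
    for (const auto & name : names) {
        paths.push_back(name.c_str());
    }

    struct llama_model_params params = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_splits(paths.data(), paths.size(), params);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}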
- // Load the model from multiple splits (support custom naming scheme)
+ // Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
const char ** paths,
// Logits for the ith token. For positive indices, equivalent to:
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
- // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+ // Negative indices can be used to access logits in reverse order, -1 is the last logit.
// returns NULL for invalid ids.
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
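A small sketch of the negative-index convention documented above: -1 addresses the logits of the last token for which logits were produced, -2 the one before it, and so on, with NULL returned for out-of-range indices. The surrounding decode setup is assumed to have already happened.

#include "llama.h"

// Returns the logits of the most recently produced token, or nullptr if unavailable.
static float * last_token_logits(struct llama_context * ctx) {
    return llama_get_logits_ith(ctx, -1);
}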
// Get the embeddings for the ith token. For positive indices, equivalent to:
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
- // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
+ // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
// shape: [n_embd] (1-dimensional)
// returns NULL for invalid ids.
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
- // Get the backend sampled probabilites for the ith token
+ // Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
- // Returns NULL if no probabilites were generated.
+ // Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
float tau,
float eta);
- /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+ /// @details Initializes a GBNF grammar, see grammars/README.md for details.
/// @param vocab The vocabulary that this grammar will be used with.
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
/// @param grammar_root The name of the start symbol for the grammar.
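To make grammar_str and grammar_root concrete, a tiny GBNF grammar as it could be passed in; the declaration this doc comment belongs to continues past this excerpt, so only the grammar text itself is sketched here.

// Minimal GBNF grammar: the start symbol is "root" and it accepts one of two literal answers.
// Passed as grammar_str, with grammar_root = "root".
static const char * grammar_str =
    "root   ::= answer\n"
    "answer ::= \"yes\" | \"no\"\n";
static const char * grammar_root = "root";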
#include "models.h"
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
n_expert, n_expert_used,
LLM_FFN_SILU,
hparams.expert_weights_norm, // norm_w (route_norm=True)
- hparams.expert_weights_scale, // scale_w
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
);
break;
case LLM_TYPE_13B:
+ case LLM_TYPE_UNKNOWN:
break;
default:
GGML_ABORT("fatal error");
#include "models.h"
-
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// feed-forward network
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
- model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
- LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ hparams.n_expert, hparams.n_expert_used,
+ LLM_FFN_GELU, false,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
model.arch == LLM_ARCH_JINA_BERT_V3) {
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// self-attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_scale) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
- }
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
}
// B1.K
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_scale) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
}
// B1.V
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_scale) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
LLM_NORM_RMS, il);
cb(cur, "attn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
- if (model.layers[il].wo_scale) {
- cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
- }
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
- model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
NULL, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
LLM_NORM_RMS, il);
cb(cur, "ffn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].ffn_down, cur);
- if (model.layers[il].ffn_down_scale) {
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
- }
+ cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
cb(cur, "ffn_down", il);
cur = ggml_add(ctx0, cur, ffn_inp);
#include "models.h"
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include <float.h>
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * inpL;
ggml_tensor * cur;
#include "models.h"
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;
#include "models.h"
-
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
-
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
cb(Qcur, "Qcur_attn_temp_scaled", il);
}
- // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il,
nullptr,
#include "models.h"
-#define CHUNK_SIZE 64
+#include "llama-impl.h"
// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
- const int CS = CHUNK_SIZE;
+ const int CS = kda ? 16 : 64; // chunk size
const int pad = (CS - n_tokens % CS) % CS;
const int n_chunks = (n_tokens + pad) / CS;
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
cb(kg_t, "key_gdiff_t", il);
- ggml_tensor * s_t = ggml_transpose(ctx0, s);
- s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
- cb(s_t, "dnet_add_ch_state", il);
+ s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
+ cb(s, "dnet_add_ch_state", il);
// [CS, S_v, n_chunks, H_v * n_seqs]
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
// [CS, S_v, 1, H_v * n_seqs]
- ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
+ ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
cb(v_t_p, "v_prime", il);
// [CS, S_v, 1, H_v * n_seqs]
cb(v_attn, "v_attn", il);
// [S_v, CS, 1, H_v * n_seqs]
- ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
cb(attn_inter, "attn_inter", il);
// [S_v, CS, 1, H_v * n_seqs]
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
- s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
- s_t = ggml_add(ctx0, s_t, kgv);
- cb(s_t, "dnet_add_ch_state", il);
+ s = ggml_mul(ctx0, s, ch_g_last_exp_t);
+ s = ggml_add(ctx0, s, kgv);
+ cb(s, "dnet_add_ch_state", il);
}
- s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
-
// truncate padded tokens
ggml_tensor * o = ggml_view_4d(ctx0, v,
S_v, n_tokens, H_v, n_seqs,
ggml_row_size(v->type, S_v * CS * n_chunks),
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
- s = ggml_transpose(ctx0, s_t);
+ s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
cb(s, "output_state", il);
return {o, s};
g = ggml_exp(ctx0, g);
s = ggml_mul(ctx0, s, g);
- ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
-
// [1, S_v, H_v, n_seqs]
ggml_tensor * sk;
- sk = ggml_mul (ctx0, s_t, k);
+ sk = ggml_mul (ctx0, s, k);
sk = ggml_sum_rows(ctx0, sk);
// [S_v, 1, H_v, n_seqs]
k = ggml_repeat(ctx0, k, s);
kd = ggml_mul (ctx0, k, d_t);
- s_t = ggml_add(ctx0, s_t, kd);
+ s = ggml_add(ctx0, s, kd);
- cb(s_t, "dnet_add_ar_state", il);
+ cb(s, "dnet_add_ar_state", il);
- ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
+ ggml_tensor * s_q = ggml_mul (ctx0, s, q);
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
- s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {o, s};
}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_fused(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(S_k == S_v);
+ GGML_ASSERT(H_v % H_k == 0);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
+
+ GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
+ GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
+ GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
+ GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
+
+ ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
+ if (n_tokens == 1) {
+ cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
+ } else {
+ cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
+ }
+
+ ggml_tensor * output = ggml_view_4d(ctx0, result,
+ S_v, H_v, n_tokens, n_seqs,
+ ggml_row_size(result->type, S_v),
+ ggml_row_size(result->type, S_v * H_v),
+ ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
+
+ ggml_tensor * new_state = ggml_view_4d(ctx0, result,
+ S_v, S_v, H_v, n_seqs,
+ ggml_row_size(result->type, S_v),
+ ggml_row_size(result->type, S_v * S_v),
+ ggml_row_size(result->type, S_v * S_v * H_v),
+ ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
+
+ return {output, new_state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il) {
+ const int64_t n_seq_tokens = q->ne[2];
+
+ if (n_seq_tokens == 1) {
+ if (cparams.fused_gdn_ar) {
+ return build_delta_net_fused(q, k, v, g, b, s, il);
+ }
+ return build_delta_net_autoregressive(q, k, v, g, b, s, il);
+ }
+
+ if (cparams.fused_gdn_ch) {
+ return build_delta_net_fused(q, k, v, g, b, s, il);
+ }
+
+ return build_delta_net_chunking(q, k, v, g, b, s, il);
+}
#include "models.h"
-
-
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
//copied from qwen2
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
-
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
- n_embd_head(model.hparams.n_embd_head_k),
+ n_embd_head(model.hparams.n_embd_head_k()),
n_embd_altup(model.hparams.n_embd_altup),
n_altup(model.hparams.n_altup),
i_altup_act(model.hparams.i_altup_act) {
#include "models.h"
llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
#include "models.h"
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_granite::llm_build_granite(
const llama_model & model,
const llm_graph_params & params)
: llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
nullptr,
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
#include "models.h"
llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
n_expert, n_expert_used,
LLM_FFN_SILU,
true, // norm_topk_prob
- false,
- 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur_moe, "ffn_moe_out", il);
#include "models.h"
llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// JAIS-2 model graph builder
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-#include "ggml.h"
#include "llama-memory-recurrent.h"
const int64_t kv_lora_rank = hparams.n_lora_kv;
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
- const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
+ const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
// Attention scale for MLA
const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
ggml_build_forward_expand(gf, cur);
- // Check layer type by checking which tensors exist
- // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
- bool is_kda = (layer.ssm_a != nullptr);
- bool is_mla = (layer.wkv_a_mqa != nullptr);
-
- if (is_kda) {
+ if (hparams.is_recurrent(il)) {
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
// Reference: vLLM kda.py
const auto * mctx_cur = inp_rs->mctx;
Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm);
- // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
+ // build_delta_net dispatches to the fused, autoregressive, or chunking implementation based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
- build_delta_net_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
- build_delta_net_chunking(Qcur, Kcur, Vcur, g1, beta, state, il);
+ auto attn_out = build_delta_net(Qcur, Kcur, Vcur, g1, beta, state, il);
ggml_tensor * output = ggml_cont(ctx0, attn_out.first);
ggml_tensor * new_state = attn_out.second;
cur = ggml_mul_mat(ctx0, layer.wo, gated);
cb(cur, "kda_out", il);
- } else if (is_mla) {
+ } else {
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
// Reference: vLLM mla.py
// Step 1: Q projection and reshape
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cb(cur, "mla_out", il);
}
- } else {
- // Unknown layer type - this should not happen
- GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
}
// On last layer, select only the output tokens
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
};
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
return build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
};
auto build_attn_block = [&model, this](ggml_tensor * cur,
ggml_tensor * inp_pos,
inp_attn_type * inp_attn,
int il) -> ggml_tensor * {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
- const auto n_embd_head = hparams.n_embd_head_v;
+ const auto n_embd_head = hparams.n_embd_head_v();
const auto n_head_kv = hparams.n_head_kv(il);
auto * q = build_lora_mm(model.layers[il].wq, cur);
#include "models.h"
llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ if (model.layers[il].wo_s) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
+ }
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
+ il,
+ nullptr, nullptr,
+ model.layers[il].ffn_up_exps_s,
+ model.layers[il].ffn_gate_exps_s,
+ model.layers[il].ffn_down_exps_s);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
+ GGML_ASSERT(d_inner % d_state == 0);
+ GGML_ASSERT(d_inner % n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
-
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
cb(cur, "ffn_out", il);
} else {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
- 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
cb(cur, "ffn_moe_out", il);
}
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
LLM_NORM_RMS, il);
cb(q, "q", il);
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+ // {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
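+ // each kv row packs one head as [ n_embd_head_qk_nope | n_embd_head_v ];
+ // the two views below select the no-PE key part (offset 0) and the value part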
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
-
#include "models.h"
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ // GGML_ASSERT(n_embd_head == n_rot); this does not hold for minimax: head_dim = 128, n_rot = 64
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "llama-model.h"
#include "llama-graph.h"
-// note: almost all graphs require atleast sqrtf, so include cmath globally
+// note: almost all graphs require at least sqrtf, so include cmath globally
#include <cmath>
//
ggml_tensor * b,
ggml_tensor * s,
int il);
+
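+ // (reference; assumed formulation of the gated delta rule, transposition conventions aside)
+ //   S_t = a_t * S_{t-1} * (I - b_t * k_t k_t^T) + b_t * v_t k_t^T,   o_t = S_t q_t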
+ // use the ggml_gated_delta_net fused operator
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il);
+
+ // choose one of two implementations above based on the number of tokens
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il);
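+ //
+ // a minimal sketch of the dispatch (assumption - the actual conditions may differ):
+ //
+ //   if (n_seq_tokens == 1) {
+ //       return cparams.fused_gdn_ar ? build_delta_net_fused         (q, k, v, g, b, s, il)
+ //                                   : build_delta_net_autoregressive(q, k, v, g, b, s, il);
+ //   }
+ //   return cparams.fused_gdn_ch ? build_delta_net_fused   (q, k, v, g, b, s, il)
+ //                               : build_delta_net_chunking(q, k, v, g, b, s, il);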
};
struct llm_build_rwkv6_base : public llm_graph_context {
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
- ggml_tensor * ffn_inp = cur;
+ ggml_tensor * inp_emb = cur;
+ ggml_tensor * inp_latent = cur;
+
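+ // latent MoE: when ffn_latent_down / ffn_latent_up are present, the expert FFNs operate
+ // in a projected latent space, while the router logits are still computed from the
+ // full-width hidden state and passed to build_moe_ffn explicitly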
+ if (model.layers[il].ffn_latent_down) {
+ inp_latent = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_down, cur);
+ }
+
+ ggml_tensor * router_logits = build_lora_mm(model.layers[il].ffn_gate_inp, cur);
+ cb(router_logits, "ffn_moe_logits", il);
+
ggml_tensor * moe_out =
- build_moe_ffn(ffn_inp,
+ build_moe_ffn(inp_latent,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr, // no gate
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
- il);
+ il,
+ router_logits);
cb(moe_out, "ffn_moe_out", il);
- ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+ if (model.layers[il].ffn_latent_up) {
+ moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
+ }
+
+ ggml_tensor * ffn_shexp = build_ffn(inp_emb,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
#include "models.h"
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- //GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ //GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * attn_norm_output;
template<bool iswa>
llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// check if this layer is Mamba or Attention
- bool is_mamba_layer = hparams.is_recurrent(il);
+ const bool is_mamba_layer = hparams.is_recurrent(il);
if (is_mamba_layer) {
// PLaMo-2 Mamba layer
cb(qkv, "wqkv", il);
// split QKV tensor into Q, K, V
- const int64_t n_embd_head_q = hparams.n_embd_head_k;
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_head_q = hparams.n_embd_head_k();
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
int32_t n_head = hparams.n_head(il);
int32_t n_head_kv = hparams.n_head_kv(il);
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
+ GGML_ASSERT(n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t head_dim_q = hparams.n_embd_head_k;
- const int64_t head_dim_v = hparams.n_embd_head_v;
+ const int64_t head_dim_q = hparams.n_embd_head_k();
+ const int64_t head_dim_v = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
#include "models.h"
llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k()));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
v_states = ggml_cont(ctx0, v_states);
cb(v_states, "v_states", il);
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v() * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v() * n_head),
0);
cb(v_states, "v_states", il);
llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// self-attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ if (model.layers[il].wo_s) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
+ }
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+ model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
llm_build_delta_net_base(params), model(model) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
const int64_t n_seqs = ubatch.n_seqs;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
- ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s);
cb(z, "z", il);
return { qkv_mixed, z };
ggml_tensor * inp_pos,
int * sections,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
// Qwen3Next uses a single Q projection that outputs query + gate
- ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
cb(Qcur_full, "Qcur_full", il);
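+ // per head, the joint projection output is (presumably) laid out as
+ // [ n_embd_head query | n_embd_head gate ]; the view below selects the query half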
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
// Apply K normalization
cur = ggml_mul(ctx0, cur, gate_sigmoid);
cb(cur, "attn_gated", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_output", il);
return cur;
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;
- ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
+ ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
cb(beta, "beta", il);
beta = ggml_sigmoid(ctx0, beta);
- ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
+ ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
cb(alpha, "alpha", il);
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
- if (num_k_heads != num_v_heads) {
+ // note: need explicit repeat only if we are not using the fused GDN
+ if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
- // TODO: try to avoid these explicit repeats by utilizing op broadcast
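+ // e.g. (hypothetical sizes) num_k_heads = 8, num_v_heads = 32: the 8 Q/K heads are
+ // tiled 4x along dim 1 to match the 32 value heads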
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
- // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
- if (n_seq_tokens == 1) {
- attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
- } else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
- }
+ auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);
+
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(final_output, "final_output", il);
// Output projection
- cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr);
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+ model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
llm_build_delta_net_base(params), model(model) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
const int64_t n_seqs = ubatch.n_seqs;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
- ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s);
cb(z, "z", il);
return { qkv_mixed, z };
ggml_tensor * inp_pos,
int * sections,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
// Qwen3Next uses a single Q projection that outputs query + gate
- ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
cb(Qcur_full, "Qcur_full", il);
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
// Apply K normalization
cur = ggml_mul(ctx0, cur, gate_sigmoid);
cb(cur, "attn_gated", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_output", il);
return cur;
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;
- ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
+ ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
cb(beta, "beta", il);
beta = ggml_sigmoid(ctx0, beta);
- ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
+ ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
cb(alpha, "alpha", il);
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
- if (num_k_heads != num_v_heads) {
+ // note: need explicit repeat only if we are not using the fused GDN
+ if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
- // TODO: try to avoid these explicit repeats by utilizing op broadcast
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
- // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
- if (n_seq_tokens == 1) {
- attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
- } else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
- }
+ auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);
+
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(final_output, "final_output", il);
// Output projection
- cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
- nullptr, model.layers[il].ffn_gate_up_exps);
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ nullptr, model.layers[il].ffn_gate_up_exps,
+ model.layers[il].ffn_up_exps_s,
+ model.layers[il].ffn_gate_exps_s,
+ model.layers[il].ffn_down_exps_s);
cb(moe_out, "ffn_moe_out", il);
// Add shared experts if present - following Qwen3Next reference implementation
if (model.layers[il].ffn_up_shexp != nullptr) {
ggml_tensor * ffn_shexp =
build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
+ model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
+ model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s,
+ model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
#include "models.h"
llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// self_attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ if (model.layers[il].wo_s) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
+ }
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
+ il,
+ nullptr, nullptr,
+ model.layers[il].ffn_up_exps_s,
+ model.layers[il].ffn_gate_exps_s,
+ model.layers[il].ffn_down_exps_s);
cb(moe_out, "ffn_moe_out", il);
cur = moe_out;
ggml_tensor * cur,
ggml_tensor * inp_pos,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
+ // TODO: avoid repeats for fused GDN, needs broadcast configuration for GDN op [TAG_GGML_GDN_BCAST]
if (num_k_heads != num_v_heads) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
int64_t repeat_factor = num_v_heads / num_k_heads;
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
- // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
- if (n_seq_tokens == 1) {
- attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
- } else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
- }
+ auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);
+
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
// MoE branch
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// RND1 is a Qwen3Moe AR model converted to diffusion model.
llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_RELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
il, probs);
#include "models.h"
llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// RoPE (partial rotary factors per layer)
const bool is_swa = hparams.is_swa(il);
ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il);
- const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2);
+ const int64_t n_rot_l = hparams.n_rot(il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
cb(cur, "ffn_out", il);
} else {
// MoE routed experts
- const bool norm_w = hparams.expert_weights_norm;
- const float w_scale = hparams.expert_weights_scale;
- const bool scale_w = w_scale != 0.0f;
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
- LLM_FFN_SILU,
- norm_w, scale_w, w_scale,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// tiny_aya digit grouping pattern from tokenizer.json:
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
// Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
- // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
+ // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex.
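+ // (e.g. 12345 -> 12, 345; the leftmost group may hold fewer than 3 digits)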
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
}