TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
+ KEY_LENGTH_SWA = "{arch}.attention.key_length_swa"
+ VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa"
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
+ DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa"
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
FREQ_BASE = "{arch}.rope.freq_base"
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
def add_value_length_mla(self, length: int) -> None:
# Writes the "{arch}.attention.value_length_mla" KV (MLA per-head value size) as uint32.
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
+ def add_key_length_swa(self, length: int) -> None:
+ # Writes the "{arch}.attention.key_length_swa" KV: per-head key size (d_k)
+ # used by sliding-window-attention layers when it differs from full-attention layers.
+ self.add_uint32(Keys.Attention.KEY_LENGTH_SWA.format(arch=self.arch), length)
+
+ def add_value_length_swa(self, length: int) -> None:
+ # Writes the "{arch}.attention.value_length_swa" KV: per-head value size (d_v)
+ # used by sliding-window-attention layers when it differs from full-attention layers.
+ self.add_uint32(Keys.Attention.VALUE_LENGTH_SWA.format(arch=self.arch), length)
+
def add_indexer_head_count(self, count: int) -> None:
# Writes the "{arch}.attention.indexer.head_count" KV as uint32.
self.add_uint32(Keys.Attention.Indexer.HEAD_COUNT.format(arch=self.arch), count)
def add_rope_dimension_count(self, count: int) -> None:
# Writes the "{arch}.rope.dimension_count" KV (number of rotated dimensions, n_rot) as uint32.
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
+ def add_rope_dimension_count_swa(self, count: int) -> None:
+ # Writes the "{arch}.rope.dimension_count_swa" KV: RoPE dimension count specific to
+ # sliding-window-attention layers (falls back to the full-attention value on the reader side).
+ self.add_uint32(Keys.Rope.DIMENSION_COUNT_SWA.format(arch=self.arch), count)
+
def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
# Writes the "{arch}.rope.dimension_sections" KV as an array of uint32
# (per-section RoPE dims; presumably used by multi-section RoPE variants such as M-RoPE — confirm with reader).
self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+ { LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+ LLM_KV_ATTENTION_KEY_LENGTH_SWA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_COUNT_SWA,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
- if (model->hparams.n_embd_head_k % blck_size != 0) {
- LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
- __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
- return nullptr;
+ for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+ if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
+ return nullptr;
+ }
}
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
- if (model->hparams.n_embd_head_v % blck_size != 0) {
- LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
- __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
- return nullptr;
+ for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+ if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
+ __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
+ return nullptr;
+ }
}
}
ubatch (params.ubatch),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
- n_rot (hparams.n_rot),
+ n_rot (hparams.n_rot()),
n_ctx (cparams.n_ctx),
n_head (hparams.n_head()),
n_head_kv (hparams.n_head_kv()),
- n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_head_k (hparams.n_embd_head_k()),
n_embd_k_gqa (hparams.n_embd_k_gqa()),
- n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_head_v (hparams.n_embd_head_v()),
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
return n_head/n_head_kv;
}
+// Per-layer RoPE dimension count: SWA layers may rotate a different number of
+// dimensions (n_rot_swa) than full-attention layers (n_rot_full).
+uint32_t llama_hparams::n_rot(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_rot_swa : n_rot_full;
+ }
+
+ GGML_ABORT("fatal error"); // layer index out of range is a programming error
+}
+
uint32_t llama_hparams::n_embd_inp() const {
uint32_t n_embd_inp = n_embd;
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
+// Per-layer key head size (d_k): SWA layers may use a different head size
+// (n_embd_head_k_swa) than full-attention layers (n_embd_head_k_full).
+uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
+ }
+
+ GGML_ABORT("fatal error"); // layer index out of range is a programming error
+}
+
+// Per-layer value head size (d_v): SWA layers may use a different head size
+// (n_embd_head_v_swa) than full-attention layers (n_embd_head_v_full).
+uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
+ }
+
+ GGML_ABORT("fatal error"); // layer index out of range is a programming error
+}
+
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
- return n_embd_head_k * n_head_kv;
+ return n_embd_head_k(il) * n_head_kv; // head size is now per-layer (SWA vs full-attention)
}
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
- return n_embd_head_v * n_head_kv;
+ return n_embd_head_v(il) * n_head_kv; // head size is now per-layer (SWA vs full-attention)
}
bool llama_hparams::is_n_embd_k_gqa_variable() const {
}
uint32_t llama_hparams::n_embd_head_k_mla() const {
- return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
+ return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k(); // non-MLA falls back to the layer-0 (full-attention) head size
}
uint32_t llama_hparams::n_embd_head_v_mla() const {
- return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
+ return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v(); // non-MLA falls back to the layer-0 (full-attention) head size
}
bool llama_hparams::has_kv(uint32_t il) const {
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
- uint32_t n_rot;
- uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
- uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ // different head size for full_attention and SWA layers
+ uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
+ uint32_t n_embd_head_k_swa;
+ uint32_t n_embd_head_v_swa;
+
+ // different RoPE dimensions for full_attention and SWA layers
+ uint32_t n_rot_full;
+ uint32_t n_rot_swa;
+
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla_impl = 0;
uint32_t n_embd_head_v_mla_impl = 0;
uint32_t n_gqa(uint32_t il = 0) const;
+ uint32_t n_rot(uint32_t il = 0) const;
+
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
+ // dimension of key/value embeddings for each head (per layer)
+ uint32_t n_embd_head_k(uint32_t il = 0) const;
+ uint32_t n_embd_head_v(uint32_t il = 0) const;
+
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
return ggml_view_4d(ctx, k,
- hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
- ggml_row_size(k->type, hparams.n_embd_head_k),
+ hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(k->type, hparams.n_embd_head_k(il)),
ggml_row_size(k->type, n_embd_k_gqa),
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
if (!v_trans) {
// note: v->nb[1] <= v->nb[2]
return ggml_view_4d(ctx, v,
- hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
- ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
// note: v->nb[1] > v->nb[2]
return ggml_view_4d(ctx, v,
- n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
- ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
+ ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, kv_size), // v->nb[2]
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale) const {
+ float freq_scale,
+ uint32_t il) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
- const auto & n_rot = hparams.n_rot;
+ const auto & n_rot = hparams.n_rot(il);
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
- const auto & n_embd_head_k = hparams.n_embd_head_k;
- //const auto & n_embd_head_v = hparams.n_embd_head_v;
-
- const auto & n_rot = hparams.n_rot;
-
- const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
-
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const auto n_rot = hparams.n_rot(il);
+ const auto n_embd_head_k = hparams.n_embd_head_k(il);
+ const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
- ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
+ ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale) const;
+ float freq_scale,
+ uint32_t il) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
} break;
case GGML_OP_ROPE:
{
- const int n_embd_head = hparams.n_embd_head_v;
+ const int n_embd_head = hparams.n_embd_head_v();
const int n_head = hparams.n_head();
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
- add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
- add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
- add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+ hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
// sanity check for n_rot (optional)
- hparams.n_rot = hparams.n_embd_head_k;
+ hparams.n_rot_full = hparams.n_embd_head_k_full;
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
- if (hparams.n_rot != hparams.n_embd_head_k) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+ if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
}
}
} else {
- hparams.n_rot = 0;
- hparams.n_embd_head_k = 0;
- hparams.n_embd_head_v = 0;
+ hparams.n_rot_full = 0;
+ hparams.n_embd_head_k_full = 0;
+ hparams.n_embd_head_v_full = 0;
+ }
+
+ // head size and n_rot for SWA layers
+ {
+ hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
+ hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
+
+ hparams.n_rot_swa = hparams.n_rot_full;
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
}
// for differentiating model types
break;
default: type = LLM_TYPE_UNKNOWN;
}
-
- // Load attention parameters
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_PLAMO3:
{
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_GEMMA3:
{
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_GEMMA3N:
{
case 24: type = LLM_TYPE_0_3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
- hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_STARCODER2:
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ // full_attention layer only use half of the RoPE dimensions
+ hparams.n_rot_full = hparams.n_rot_full / 2;
+
// MoE + SWA parameters
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_token_types = vocab.n_token_types();
- const int64_t n_rot = hparams.n_rot;
+ const int64_t n_rot = hparams.n_rot();
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
const int64_t n_ctx_train = hparams.n_ctx_train;
} break;
case LLM_ARCH_MINICPM3:
{
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
// attention parameters
- const uint32_t qk_dim = hparams.n_embd_head_k;
- const uint32_t v_dim = hparams.n_embd_head_v;
+ const uint32_t qk_dim = hparams.n_embd_head_k();
+ const uint32_t v_dim = hparams.n_embd_head_v();
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
} break;
case LLM_ARCH_PLAMO3:
{
- const int64_t head_dim_q = hparams.n_embd_head_k;
- const int64_t head_dim_v = hparams.n_embd_head_v;
+ const int64_t head_dim_q = hparams.n_embd_head_k();
+ const int64_t head_dim_v = hparams.n_embd_head_v();
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
} break;
case LLM_ARCH_SEED_OSS:
{
- const uint32_t head_dim = hparams.n_embd_head_k;
+ const uint32_t head_dim = hparams.n_embd_head_k();
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
GGML_ASSERT(n_embd_head_qk_nope >= 1);
} break;
case LLM_ARCH_PLM:
{
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const int64_t kv_lora_rank = hparams.n_lora_kv;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
- const int64_t head_dim = hparams.n_embd_head_k;
+ const int64_t head_dim = hparams.n_embd_head_k();
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
// Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
// Note: hparams.n_rot may be 72 (from conversion) but actual is 64
- const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
+ const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
// Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
uint32_t n_rot_max = 0;
for (int i = 0; i < n_layer; ++i) {
- n_rot_max = std::max(n_rot_max, hparams.n_rot);
+ n_rot_max = std::max(n_rot_max, hparams.n_rot());
}
if (n_rot_max == 0) {
n_rot_max = n_rot;
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa);
+ LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa);
}
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
#include "models.h"
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include <float.h>
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * inpL;
ggml_tensor * cur;
#include "models.h"
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;
#include "models.h"
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
//copied from qwen2
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
- n_embd_head(model.hparams.n_embd_head_k),
+ n_embd_head(model.hparams.n_embd_head_k()),
n_embd_altup(model.hparams.n_embd_altup),
n_altup(model.hparams.n_altup),
i_altup_act(model.hparams.i_altup_act) {
#include "models.h"
llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
#include "models.h"
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
const llm_graph_params & params)
: llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// JAIS-2 model graph builder
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
const int64_t kv_lora_rank = hparams.n_lora_kv;
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
- const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
+ const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
// Attention scale for MLA
const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
inp_attn_type * inp_attn,
int il) -> ggml_tensor * {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
- const auto n_embd_head = hparams.n_embd_head_v;
+ const auto n_embd_head = hparams.n_embd_head_v();
const auto n_head_kv = hparams.n_head_kv(il);
auto * q = build_lora_mm(model.layers[il].wq, cur);
#include "models.h"
llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
LLM_NORM_RMS, il);
cb(q, "q", il);
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+ // {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
#include "models.h"
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ // GGML_ASSERT(n_embd_head == n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- //GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ //GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * attn_norm_output;
template<bool iswa>
llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
cb(qkv, "wqkv", il);
// split QKV tensor into Q, K, V
- const int64_t n_embd_head_q = hparams.n_embd_head_k;
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_head_q = hparams.n_embd_head_k();
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
int32_t n_head = hparams.n_head(il);
int32_t n_head_kv = hparams.n_head_kv(il);
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t head_dim_q = hparams.n_embd_head_k;
- const int64_t head_dim_v = hparams.n_embd_head_v;
+ const int64_t head_dim_q = hparams.n_embd_head_k();
+ const int64_t head_dim_v = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
#include "models.h"
llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k()));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
v_states = ggml_cont(ctx0, v_states);
cb(v_states, "v_states", il);
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v() * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v() * n_head),
0);
cb(v_states, "v_states", il);
llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
llm_build_delta_net_base(params), model(model) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * inp_pos,
int * sections,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
llm_build_delta_net_base(params), model(model) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * inp_pos,
int * sections,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
#include "models.h"
llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * cur,
ggml_tensor * inp_pos,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// RND1 is a Qwen3Moe AR model converted to diffusion model.
llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// RoPE (partial rotary factors per layer)
const bool is_swa = hparams.is_swa(il);
ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il);
- const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2);
+ const int64_t n_rot_l = hparams.n_rot(il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
#include "models.h"
llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;