self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
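+ # Gather the global and sliding-window (local) RoPE base frequencies from the
+ # various key spellings used across HF configs so they can be mirrored below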
+ rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
+ local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
- if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
+ if local_rope_theta is not None:
+ self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
+ if "rope_theta" not in self.rope_parameters and rope_theta is not None:
self.rope_parameters["rope_theta"] = rope_theta
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
self.rope_parameters["rope_type"] = rope_type
self.gguf_writer.add_head_count_kv(n_head_kv)
logger.info(f"gguf: key-value head count = {n_head_kv}")
+ # TODO: Handle "sliding_attention" similarly when models start implementing it
rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
if (rope_type := rope_params.get("rope_type")) is not None:
rope_factor = rope_params.get("factor")
if (rope_theta := rope_params.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
logger.info(f"gguf: rope theta = {rope_theta}")
+ if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
+ self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
+ logger.info(f"gguf: rope theta swa = {local_rope_theta}")
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
self.gguf_writer.add_sliding_window(sliding_window)
self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
- self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("rope_local_theta")})["rope_theta"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
- self.gguf_writer.add_rope_freq_base_swa(self.hparams["swa_rope_theta"])
self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
- self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
- float rope_freq_base_train_swa;
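+ // defaults for SWA layers; architectures that use SWA override these during loading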
+ float rope_freq_base_train_swa = 10000.0f;
float rope_freq_scale_train;
- float rope_freq_scale_train_swa;
+ float rope_freq_scale_train_swa = 1.0f;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
+ // TODO: Handle SWA metadata similarly when models start implementing it
// rope_freq_scale (inverse of the kv) is optional
float ropescale = 0.0f;
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
}
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
- // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
- hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
- hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
// non-transformer models do not have attention heads
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
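+ // SWA layers default to the full-attention RoPE parameters; the GGUF may optionally override the base frequency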
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
switch (hparams.n_expert) {
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.rope_freq_scale_train_swa = 1.0f;
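+ // unlike the optional overrides elsewhere, this architecture requires an explicit SWA RoPE base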
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
hparams.n_swa = 4096; // default value of gemma 2
hparams.set_swa_pattern(2);
hparams.attn_soft_cap = true;
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(6);
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
hparams.set_swa_pattern(5);
hparams.n_layer_kv_from_start = 20;
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.set_swa_pattern(6);
hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = 1.0f; // See olmo2.cpp
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(2);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_20B; break;
case 36: type = LLM_TYPE_120B; break;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4, true);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_no_rope_layer_step = hparams.n_layer;
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ }
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
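+ // select the per-layer RoPE frequency base/scale (SWA layers can use different training values)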
+
ggml_tensor * inpSA = inpL;
+ // The use_rope split overlaps with the SWA layers in current models, so the per-layer get_rope_freq_base/scale may be superfluous
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;
+
// dual attention normalization (pre)
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
- // RoPE only for sliding_attention layers
- const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
- ((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il);
}
for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il);
+ // UNUSED:
+ // const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
ggml_tensor * inpSA = inpL;
+ // The use_rope split overlaps with the SWA layers in current models, so the per-layer get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
} else if (inp_attn_scale) {
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
- float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
ggml_tensor * inpSA = inpL;
// norm
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
ggml_tensor * inpSA = inpL;
- ggml_tensor * probs = nullptr;
- probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
+ // The use_rope split overlaps with the SWA layers in current models, so the per-layer get_rope_freq_base/scale may be superfluous
+ const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
+ il % hparams.n_no_rope_layer_step != 0;
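+ // (when n_no_rope_layer_step == n_layer every layer uses RoPE; otherwise every n_no_rope_layer_step-th layer, starting at layer 0, skips it)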
+
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
cb(probs, "ffn_moe_logits", il);
// norm
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
- Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ if (use_rope) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
- Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);