}
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+ // (optional) temperature tuning - used by mistral-large
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
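+ // assumed behaviour, mirroring the existing llama 4 temperature-tuning path:
+ // the per-position factor built from these hparams is
+ //   log(floor((pos + 1) / n_attn_temp_floor_scale) + 1) * f_attn_temp_scale + 1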
+
switch (hparams.n_layer) {
case 27: type = LLM_TYPE_16B; break;
case 60: type = LLM_TYPE_236B; break;
// {n_embd, n_tokens}
inpL = build_inp_embd(model.tok_embd);
+ // (optional) temperature tuning - used by mistral-large
+ ggml_tensor * inp_attn_scale = nullptr;
+ if (hparams.f_attn_temp_scale != 0.0f) {
+ inp_attn_scale = build_inp_attn_scale();
+ }
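+ // inp_attn_scale is assumed to hold one temperature factor per position
+ // (shape {1, 1, n_tokens}), so it broadcasts against Qcur below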
+
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * Vcur = kv_cmpr;
cb(Vcur, "Vcur", il);
+ if (inp_attn_scale) {
+ // apply llama 4 temperature scaling
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ cb(Qcur, "Qcur_attn_temp_scaled", il);
+ }
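+ // note: scaling Qcur scales the pre-softmax logits Q*K^T by the same
+ // per-position factor, which is the intended temperature effect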
+
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
cb(Kcur, "Kcur", il);
+ if (inp_attn_scale) {
+ // apply llama 4 temperature scaling
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ cb(Qcur, "Qcur_attn_temp_scaled", il);
+ }
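+ // same per-position temperature scaling as in the absorbed (MQA) branch above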
+
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,