ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
case LLM_ARCH_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 3:
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 12:
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
hparams.f_max_alibi_bias = 8.0f;
switch (hparams.n_layer) {
case LLM_ARCH_JINA_BERT_V3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 24:
case LLM_ARCH_NOMIC_BERT_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
case LLM_ARCH_NEO_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 28) {
type = LLM_TYPE_250M;
case LLM_ARCH_EUROBERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 12) {
type = LLM_TYPE_SMALL; // 0.2B
// fall through
case LLM_ARCH_QWEN2:
{
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
} break;
case LLM_ARCH_QWEN3:
{
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
//applied only if model converted with --sentence-transformers-dense-modules
ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
} break;
case LLM_ARCH_BAILINGMOE:
{