# This logic matches modeling_plamo.py's is_mamba function
mamba_step = hparams.get("mamba_step", 2)
mamba_enabled = hparams.get("mamba_enabled", True)
- mamba_layers = []
+ num_key_value_heads = []
+ num_attention_heads = []
if mamba_enabled:
    for i in range(block_count):
        if block_count <= (mamba_step // 2):
            # use attention in last layer
            is_mamba = i != block_count - 1
        else:
            is_mamba = (i % mamba_step) != (mamba_step // 2)
        if is_mamba:
-             mamba_layers.append(0)
+             num_key_value_heads.append(0)
+             num_attention_heads.append(0)
        else:
-             mamba_layers.append(hparams.get("num_key_value_heads", 4))
+             num_key_value_heads.append(hparams.get("num_key_value_heads", 4))
+             num_attention_heads.append(hparams.get("num_attention_heads", 32))

- if mamba_layers:
-     self.gguf_writer.add_head_count_kv(mamba_layers)
+ if num_key_value_heads and num_attention_heads:
+     self.gguf_writer.add_head_count_kv(num_key_value_heads)
+     self.gguf_writer.add_head_count(num_attention_heads)
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
+ self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
+ self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
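
Taken together, the loop now records one entry per block, with 0 marking Mamba blocks. A minimal sketch of the result (illustrative only, not part of the conversion script; it uses the default hparams above and a hypothetical block_count of 8, so with mamba_step = 2 the even-indexed blocks are Mamba and the odd-indexed blocks are attention):

# Illustrative sketch: per-layer head-count lists for a hypothetical 8-block model
# with the default mamba_step = 2.
block_count = 8
mamba_step = 2

num_key_value_heads = []
num_attention_heads = []
for i in range(block_count):
    is_mamba = (i % mamba_step) != (mamba_step // 2)
    if is_mamba:
        # Mamba blocks carry no attention heads
        num_key_value_heads.append(0)
        num_attention_heads.append(0)
    else:
        num_key_value_heads.append(4)    # default for num_key_value_heads above
        num_attention_heads.append(32)   # default for num_attention_heads above

print(num_attention_heads)  # [0, 32, 0, 32, 0, 32, 0, 32]
print(num_key_value_heads)  # [0, 4, 0, 4, 0, 4, 0, 4]

Writing the head counts as per-layer arrays lets the GGUF metadata distinguish attention blocks (non-zero head count) from Mamba blocks (zero head count).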
                }
                break;
            default: type = LLM_TYPE_UNKNOWN;
        }
+
+         // Load attention parameters (optional keys: the defaults are kept if they are missing)
+         ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH,   hparams.n_embd_head_k, false);
+         ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
    } break;
case LLM_ARCH_GPT2:
    {
        // ...
    } break;
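
For orientation, the two get_key calls above read back what the conversion script writes with add_key_length and add_value_length. A small, illustrative sketch of the metadata key names involved (templates as defined in gguf-py's constants; the "plamo2" architecture string is an assumption here):

# Illustrative only: GGUF metadata keys connecting the Python writer calls to the
# C++ reads above. The "plamo2" architecture prefix is assumed.
arch = "plamo2"

keys = {
    "add_head_count":    f"{arch}.attention.head_count",     # now a per-layer array
    "add_head_count_kv": f"{arch}.attention.head_count_kv",  # now a per-layer array
    "add_key_length":    f"{arch}.attention.key_length",     # read via LLM_KV_ATTENTION_KEY_LENGTH
    "add_value_length":  f"{arch}.attention.value_length",   # read via LLM_KV_ATTENTION_VALUE_LENGTH
}

for writer_call, key in keys.items():
    print(f"{writer_call:18} -> {key}")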
case LLM_ARCH_PLAMO2:
    {
+         // mamba parameters
        const uint32_t d_conv = hparams.ssm_d_conv;
        const uint32_t d_state = hparams.ssm_d_state;
        const uint32_t num_heads = hparams.ssm_dt_rank;
        const uint32_t intermediate_size = hparams.ssm_d_inner;
-         const uint32_t head_dim = intermediate_size / num_heads;
-         const uint32_t qk_dim = head_dim;
-         const uint32_t v_dim = head_dim;
-         const int64_t num_attention_heads = hparams.n_head();
-         const int64_t q_num_heads = num_attention_heads;
        const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
+         // attention parameters
+         const uint32_t qk_dim = hparams.n_embd_head_k;
+         const uint32_t v_dim = hparams.n_embd_head_v;
+
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        // ...

                layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
                layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
            } else {
+                 const int64_t num_attention_heads = hparams.n_head(i);
+                 const int64_t q_num_heads = num_attention_heads;
                const int64_t num_key_value_heads = hparams.n_head_kv(i);
                const int64_t k_num_heads = num_key_value_heads;
                const int64_t v_num_heads = num_key_value_heads;
                const int64_t q_proj_dim = q_num_heads * qk_dim;
                const int64_t k_proj_dim = k_num_heads * qk_dim;
                const int64_t v_proj_dim = v_num_heads * v_dim;

                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
-                 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
-                 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
+                 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+                 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
            }
// ...

const int64_t n_embd_head_q = hparams.n_embd_head_k;  // Q uses the same head dimension as K
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ int32_t n_head = hparams.n_head(il);
int32_t n_head_kv = hparams.n_head_kv(il);
const int64_t q_offset = 0;  // Q rows come first in the fused QKV output
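
Since q_offset is the first of the slice offsets into the fused QKV output, the per-layer head counts and the key/value lengths fully determine the layout. The sketch below is illustrative: only q_offset appears in the diff, and the k_offset / v_offset names are assumptions, but the sizes mirror the q_proj_dim + k_proj_dim + v_proj_dim shape used for wqkv above.

# Illustrative sketch of the fused QKV row layout implied by the wqkv shape
# {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}; k_offset/v_offset are assumed names.
def qkv_offsets(n_head: int, n_head_kv: int, qk_dim: int, v_dim: int):
    q_size = n_head * qk_dim       # q_proj_dim
    k_size = n_head_kv * qk_dim    # k_proj_dim
    v_size = n_head_kv * v_dim     # v_proj_dim

    q_offset = 0
    k_offset = q_size
    v_offset = q_size + k_size
    return (q_offset, q_size), (k_offset, k_size), (v_offset, v_size)

# Example with the defaults used in the conversion script: 32 heads, 4 KV heads, 128-dim heads.
print(qkv_offsets(32, 4, 128, 128))
# ((0, 4096), (4096, 512), (4608, 512))  -> wqkv has 4096 + 512 + 512 = 5120 output rows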