        # consistency
        if attention_scale := self.hparams.get("attention_multiplier"):
            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
        if embedding_scale := self.hparams.get("embedding_multiplier"):
            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
        if residual_scale := self.hparams.get("residual_multiplier"):
            self.gguf_writer.add_residual_scale(residual_scale)
-        if logits_scaling := self.hparams.get("logits_scaling"):
-            self.gguf_writer.add_logit_scale(logits_scaling)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
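
The split above can be sanity-checked in isolation. A minimal sketch, not part of the change itself: the expert count, FFN width, and embedding width are made-up values, and the leading expert dimension is assumed from the JetMoe-style parallel-expert layout the docstring describes; only the slicing mirrors modify_tensors.

import torch

n_experts, ffn_dim, n_embd = 8, 512, 256                    # illustrative shapes only
merged = torch.randn(n_experts, 2 * ffn_dim, n_embd)        # stand-in for block_sparse_moe.input_linear.weight
gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]
assert gate.shape == up.shape == (n_experts, ffn_dim, n_embd)
assert torch.equal(torch.cat([gate, up], dim=-2), merged)   # the two halves recombine losslessly
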
###### CONVERSION LOGIC ######
+
# tree of lazy tensors
class LazyTorchTensor(gguf.LazyBase):
    _tensor_type = torch.Tensor
NEMOTRON = auto()
EXAONE = auto()
GRANITE = auto()
+ GRANITE_MOE = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite",
+ MODEL_ARCH.GRANITE_MOE: "granitemoe",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.GRANITE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
+ MODEL_ARCH.GRANITE_MOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ ],
# TODO
}
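
For reference, the tables above are what later resolve into concrete GGUF tensor names. A quick sketch of how they can be inspected, assuming the gguf-py package from this tree is importable; the block index is arbitrary:

from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES

arch = MODEL_ARCH.GRANITE_MOE
print(MODEL_ARCH_NAMES[arch])              # "granitemoe"
for t in MODEL_TENSORS[arch]:
    print(TENSOR_NAMES[t].format(bid=0))   # e.g. "blk.0.ffn_gate_exps"
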
),
MODEL_TENSOR.FFN_GATE_INP: (
- "layers.{bid}.feed_forward.gate", # mixtral
- "model.layers.{bid}.block_sparse_moe.gate", # mixtral
- "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
- "transformer.decoder_layer.{bid}.router", # Grok
- "transformer.blocks.{bid}.ffn.router.layer", # dbrx
+ "layers.{bid}.feed_forward.gate", # mixtral
+ "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+ "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
+ "transformer.decoder_layer.{bid}.router", # Grok
+ "transformer.blocks.{bid}.ffn.router.layer", # dbrx
+ "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
),
MODEL_TENSOR.FFN_DOWN_EXP: (
- "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
- "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
- "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
- "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
+ "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
+ "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
+ "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
+ "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
+ "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
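
With the two granitemoe entries above, the router and the merged expert down-projection from the HF checkpoint resolve to the standard MoE tensor names. A small sketch, with an arbitrary layer index and block count; the commented outputs assume get_name re-appends the tried .weight suffix, as the current TensorNameMap does:

from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

tmap = get_tensor_name_map(MODEL_ARCH.GRANITE_MOE, 32)
print(tmap.get_name("model.layers.0.block_sparse_moe.router.layer.weight",
                    try_suffixes=(".weight",)))   # blk.0.ffn_gate_inp.weight
print(tmap.get_name("model.layers.0.block_sparse_moe.output_linear.weight",
                    try_suffixes=(".weight",)))   # blk.0.ffn_down_exps.weight

The merged input_linear (gate and up) is intentionally not mapped here; it is split into FFN_GATE_EXP and FFN_UP_EXP by GraniteMoeModel.modify_tensors in the conversion script above.
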
LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6,
LLM_ARCH_GRANITE,
+ LLM_ARCH_GRANITE_MOE,
LLM_ARCH_UNKNOWN,
};
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_GRANITE, "granite" },
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GRANITE_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
{
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;
- // Additional scale factors (Granite)
+ // Additional scale factors (Granite/Granite MoE)
float f_residual_scale = 0.0f;
float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f;
}
} break;
case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
case 40: model.type = e_model::MODEL_3B; break;
// Add additional layer/vocab/etc checks here for other model sizes
default: model.type = e_model::MODEL_UNKNOWN;
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
- if (model.arch == LLM_ARCH_GRANITE) {
+ if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
case LLM_ARCH_REFACT:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
switch (model.arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
{
result = llm.build_llama();
} break;
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2