self.gguf_writer.add_expert_group_used_count(n_group_used)
logger.info(f"gguf: expert groups used count = {n_group_used}")
- if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None:
+ if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
if score_func == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif score_func == "softmax":
raise ValueError(f"Unprocessed experts: {experts}")
+@ModelBase.register("Step3p5ForCausalLM")
+class Step35Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.STEP35
+
+ def set_gguf_parameters(self):
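+        # Step3.5 ships rope_theta either as a scalar or as a [full-attention, sliding-attention] pair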
+ rope_theta = self.hparams.get("rope_theta")
+ if isinstance(rope_theta, list):
+ self.hparams["rope_theta"] = float(rope_theta[0])
+ self.hparams["local_rope_theta"] = float(rope_theta[1])
+ self.rope_parameters["rope_theta"] = self.hparams["rope_theta"]
+ self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]}
+
+ super().set_gguf_parameters()
+
+ layer_types = self.hparams.get("layer_types") or []
+ partial_rotary_factors = self.hparams.get("partial_rotary_factors") or []
+ attn_other = self.hparams.get("attention_other_setting") or {}
+
+ n_head_base = self.hparams["num_attention_heads"]
+ n_kv_base = self.hparams["num_attention_groups"]
+
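+        # sliding-attention layers may override head/KV-group counts via "attention_other_setting"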
+ n_head_swa = attn_other.get("num_attention_heads", n_head_base)
+ n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
+
+ layer_types = layer_types[: self.block_count]
+ partial_rotary_factors = partial_rotary_factors[: self.block_count]
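+        # sanity check: sliding-attention layers rotate the full head dim (factor 1.0),
+        # while full-attention layers rotate only half of it (factor 0.5)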
+ assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
+ head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
+ kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
+ swa_pat = [lt == "sliding_attention" for lt in layer_types]
+
+ self.gguf_writer.add_head_count(head_arr)
+ self.gguf_writer.add_head_count_kv(kv_arr)
+
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+ self.gguf_writer.add_sliding_window_pattern(swa_pat)
+
+ self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+ # MoE params
+ self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+ self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
+ self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+ self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"])
+
+ if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None:
+ self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor)
+ if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None:
+ self.gguf_writer.add_expert_weights_norm(norm_expert_weight)
+
+ # leading dense blocks
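+        # moe_layers_enum is a comma-separated string of MoE layer indices; every layer
+        # before the first listed index is a dense block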
+ leading_dense = 0
+ moe_layers_enum = self.hparams.get("moe_layers_enum")
+ if isinstance(moe_layers_enum, str) and moe_layers_enum.strip():
+ moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(","))
+ if moe_layers:
+ leading_dense = max(0, moe_layers[0])
+ self.gguf_writer.add_leading_dense_block_count(leading_dense)
+ self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1)))
+
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
+
+ # Optional per-layer SwiGLU clamps.
+ if (limits := self.hparams.get("swiglu_limits")) is not None:
+ limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
+ self.gguf_writer.add_swiglu_clamp_exp(limits_f)
+ if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
+ limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
+ self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # drop MTP (multi-token prediction) layers beyond the main transformer stack
+ if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
+ il = int(m.group(1))
+ n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
+ if il >= n_main:
+ return
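+        # norm weights appear to be stored as (w - 1); add 1 back to recover the standard RMSNorm weight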
+ if name.endswith("norm.weight"):
+ data_torch += 1.0
+ # Map router bias (expert selection bias) to a GGUF bias tensor
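+        # (the HF tensor name carries no ".weight"/".bias" suffix, so appending ".bias"
+        # lets the tensor map resolve it to FFN_EXP_PROBS_B)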
+ if name.endswith(".moe.router_bias"):
+ name += ".bias"
+
+ if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")):
+ data_torch = data_torch.squeeze().contiguous()
+
+ yield from super().modify_tensors(data_torch, name, bid)
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
+ # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
+ rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
+ rope_type = rope_params.get("rope_type") or ""
+ if rope_type.lower() != "llama3":
+ return
+
+        # Step35 configs can carry rope_theta as a [full-attention, sliding-attention] pair; for llama3 rope factors we use the full-attention base.
+ rope_theta = self.hparams.get("rope_theta", 10000.0)
+ if isinstance(rope_theta, list):
+ rope_theta = rope_theta[0]
+ base = float(rope_theta)
+ if (dim := self.hparams.get("head_dim")) is None:
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+ dim = int(dim)
+
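+        # standard RoPE inverse frequencies: base^(-2i/dim) for i = 0 .. dim/2 - 1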
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+ factor = float(rope_params.get("factor", 8.0))
+ low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
+ high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
+ old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+
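+        # llama3-style smoothing: short wavelengths keep factor 1.0, long wavelengths get the
+        # full scaling factor, and the band in between is linearly interpolated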
+ rope_factors: list[float] = []
+ for freq in freqs:
+ wavelen = 2 * math.pi / float(freq)
+ if wavelen < high_freq_wavelen:
+ rope_factors.append(1.0)
+ elif wavelen > low_freq_wavelen:
+ rope_factors.append(factor)
+ else:
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+ rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
@ModelBase.register("PanguEmbeddedForCausalLM")
class PanguEmbeddedModel(TextModel):
model_arch = gguf.MODEL_ARCH.PANGU_EMBED
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
+ SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp"
+ SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp"
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
class Rope:
- DIMENSION_COUNT = "{arch}.rope.dimension_count"
- DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
- FREQ_BASE = "{arch}.rope.freq_base"
- FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
- SCALING_TYPE = "{arch}.rope.scaling.type"
- SCALING_FACTOR = "{arch}.rope.scaling.factor"
- SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
- SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
- SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
- SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
- SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor"
- SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor"
- SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast"
- SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow"
+ DIMENSION_COUNT = "{arch}.rope.dimension_count"
+ DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
+ FREQ_BASE = "{arch}.rope.freq_base"
+ FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
+ SCALING_TYPE = "{arch}.rope.scaling.type"
+ SCALING_FACTOR = "{arch}.rope.scaling.factor"
+ SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
+ SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
+ SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
+ SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
+ SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor"
+ SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor"
+ SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast"
+ SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow"
class Split:
LLM_KV_SPLIT_NO = "split.no"
PANGU_EMBED = auto()
MISTRAL3 = auto()
MIMO2 = auto()
+ STEP35 = auto()
LLAMA_EMBED = auto()
MAINCODER = auto()
KIMI_LINEAR = auto()
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
MODEL_ARCH.MISTRAL3: "mistral3",
MODEL_ARCH.MIMO2: "mimo2",
+ MODEL_ARCH.STEP35: "step35",
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
MODEL_ARCH.MAINCODER: "maincoder",
MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
+ MODEL_ARCH.STEP35: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_GATE,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
+ ],
MODEL_ARCH.LLAMA_EMBED: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS
# RoPE
-KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
-KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
-KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
-KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
-KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
-KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED
+KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
+KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
+KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
+KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
+KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
+KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED
# SSM
KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
+ def add_swiglu_clamp_exp(self, values: Sequence[float]) -> None:
+ self.add_array(Keys.LLM.SWIGLU_CLAMP_EXP.format(arch=self.arch), values)
+
+ def add_swiglu_clamp_shexp(self, values: Sequence[float]) -> None:
+ self.add_array(Keys.LLM.SWIGLU_CLAMP_SHEXP.format(arch=self.arch), values)
+
def add_expert_group_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value)
MODEL_TENSOR.ATTN_GATE: (
"model.layers.{bid}.self_attn.gate_proj", # afmoe
+ "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate
),
# Feed-forward norm
"model.layers.{bid}.mlp.router.gate", # afmoe
"layers.{bid}.gate", # mistral-large
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
+ "model.layers.{bid}.moe.gate", # step3.5
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
"backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe
"model.layers.{bid}.mlp.e_score_correction", # exaone-moe
"model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi
+ "model.layers.{bid}.moe.router_bias", # step3.5 expert selection bias
),
# Feed-forward up
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
"model.layers.{bid}.block_sparse_moe.experts.up", # smallthinker
+ "model.layers.{bid}.moe.up_proj", # step3.5
),
MODEL_TENSOR.FFN_UP_SHEXP: (
"layers.{bid}.shared_experts.w3", # mistral-large
"backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
"model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi
+ "model.layers.{bid}.share_expert.up_proj", # step3.5
),
MODEL_TENSOR.FFN_UP_CHEXP: (
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
"model.layers.{bid}.block_sparse_moe.experts.gate", # smallthinker
+ "model.layers.{bid}.moe.gate_proj", # step3.5
),
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
"layers.{bid}.shared_experts.w1", # mistral-large
"model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi
+ "model.layers.{bid}.share_expert.gate_proj", # step3.5
),
MODEL_TENSOR.FFN_GATE_CHEXP: (
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
"model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
+ "model.layers.{bid}.moe.down_proj", # step3.5
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"layers.{bid}.shared_experts.w2", # mistral-large
"backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
"model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi
+ "model.layers.{bid}.share_expert.down_proj", # step3.5
),
MODEL_TENSOR.FFN_DOWN_CHEXP: (
models/stablelm.cpp
models/starcoder.cpp
models/starcoder2.cpp
+ models/step35-iswa.cpp
models/t5-dec.cpp
models/t5-enc.cpp
models/wavtokenizer-dec.cpp
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
- { LLM_ARCH_MIMO2, "mimo2" },
+ { LLM_ARCH_MIMO2, "mimo2" },
+ { LLM_ARCH_STEP35, "step35" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
{ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
{ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
+ { LLM_KV_SWIGLU_CLAMP_EXP, "%s.swiglu_clamp_exp" },
+ { LLM_KV_SWIGLU_CLAMP_SHEXP, "%s.swiglu_clamp_shexp" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
- { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
- { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
- { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
- { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
- { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
- { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
- { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
- { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
- { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
- { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
- { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
- { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
- { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
- { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
- { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
+ { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+ { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+ { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
+ { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+ { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+ { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+ { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+ { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+ { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
+ { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
+ { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
+ { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
};
+ case LLM_ARCH_STEP35:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
case LLM_ARCH_GPTJ:
case LLM_ARCH_UNKNOWN:
return {
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
+ LLM_ARCH_STEP35,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_KIMI_LINEAR,
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
+ LLM_KV_SWIGLU_CLAMP_EXP,
+ LLM_KV_SWIGLU_CLAMP_SHEXP,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
#include <cassert>
#include <cmath>
#include <cstring>
+#include <numeric>
+#include <sstream>
#include <unordered_set>
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
switch (type_op) {
case LLM_FFN_SILU:
if (gate && type_gate == LLM_FFN_PAR) {
+ // Step35: HF clamps gate (after SiLU) and up before multiplication
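+                // i.e. out = min(silu(cur), limit) * clamp(tmp, -limit, limit)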
+ if (arch == LLM_ARCH_STEP35 && il >= 0) {
+ const float limit = hparams.swiglu_clamp_shexp[il];
+ constexpr float eps = 1e-6f;
+ if (limit > eps) {
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_silu", il);
+ gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
+ cb(gate_act, "ffn_silu_clamped", il);
+
+ tmp = ggml_clamp(ctx0, tmp, -limit, limit);
+ cb(tmp, "ffn_up_clamped", il);
+
+ cur = ggml_mul(ctx0, gate_act, tmp);
+ cb(cur, "ffn_swiglu_limited", il);
+ type_gate = LLM_FFN_SEQ;
+ break;
+ }
+ }
+
cur = ggml_swiglu_split(ctx0, cur, tmp);
cb(cur, "ffn_swiglu", il);
type_gate = LLM_FFN_SEQ;
switch (type_op) {
case LLM_FFN_SILU:
if (gate_exps) {
+ // Step35: per-layer clamp for routed experts
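+                    // same limited-SwiGLU form: out = min(silu(cur), limit) * clamp(up, -limit, limit)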
+ if (arch == LLM_ARCH_STEP35 && il >= 0) {
+ const float limit = hparams.swiglu_clamp_exp[il];
+ constexpr float eps = 1e-6f;
+ if (limit > eps) {
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_moe_silu", il);
+ gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
+ cb(gate_act, "ffn_moe_silu_clamped", il);
+
+ up = ggml_clamp(ctx0, up, -limit, limit);
+ cb(up, "ffn_moe_up_clamped", il);
+
+ cur = ggml_mul(ctx0, gate_act, up);
+ cb(cur, "ffn_moe_swiglu_limited", il);
+ break;
+ }
+ }
+
cur = ggml_swiglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_swiglu", il);
} else {
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
+
+ // Step35: optional per-layer clamps for (Swi)GLU
+ std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN
+ std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert
+
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
// dense_first means whether the pattern is start with a dense layer
// note that if n_pattern == 0, all layers are SWA
}
bool llama_kv_cache_iswa::get_can_shift() const {
- return kv_base->get_size() == kv_swa->get_size();
+ return kv_base->get_can_shift() &&
+ kv_swa->get_can_shift() &&
+ kv_base->get_size() == kv_swa->get_size();
}
void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
}
bool llama_kv_cache::get_can_shift() const {
+ // Step35 uses per-layer RoPE dims; K-shift assumes a single global n_rot.
+ if (model.arch == LLM_ARCH_STEP35) {
+ return false;
+ }
return true;
}
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
+ case LLM_TYPE_196B_A11B: return "196B.A11B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+ std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
+ std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_STEP35:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ // MoE + SWA parameters
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // Step35 uses sigmoid gating by default (if not set in GGUF)
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
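+            // optional per-layer clamps; 0.0f (the default) means "no clamp" in the graph code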
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
+
+ switch (hparams.n_layer) {
+ case 45: type = LLM_TYPE_196B_A11B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
default: throw std::runtime_error("unsupported model architecture");
}
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
}
} break;
+ case LLM_ARCH_STEP35:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+            // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
+            // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
+            // hparams.n_rot is a single global value in llama.cpp, so size the shared tensor from it directly.
+            uint32_t n_rot_max = hparams.n_rot;
+            if (n_rot_max == 0) {
+                n_rot_max = n_rot;
+            }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const uint32_t n_head_l = hparams.n_head(i);
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+
+ // optional rope factors (llama3) / longrope tensors
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
+
+ // head-wise attention gate (Step35 self_attn.g_proj)
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // dense MLP (leading dense blocks)
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ // MoE routed experts + selection bias (router_bias)
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+ // shared expert MLP
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
case LLM_ARCH_MAINCODER:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
{
llm = std::make_unique<llm_build_kimi_linear>(*this, params);
} break;
+ case LLM_ARCH_STEP35:
+ {
+ llm = std::make_unique<llm_build_step35_iswa>(*this, params);
+ } break;
default:
GGML_ABORT("fatal error");
}
case LLM_ARCH_AFMOE:
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_MIMO2:
+ case LLM_ARCH_STEP35:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
+ LLM_TYPE_196B_A11B, // Step3.5-Flash
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
llm_build_starcoder(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_step35_iswa : public llm_graph_context {
+ llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
struct llm_build_t5_dec : public llm_graph_context {
llm_build_t5_dec(const llama_model & model, const llm_graph_params & params);
};
--- /dev/null
+#include "models.h"
+
+llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ const uint32_t n_head_l = hparams.n_head(il);
+ const uint32_t n_head_kv_l = hparams.n_head_kv(il);
+
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ cur = inpL;
+
+        // expose the pre-attention norm input to the debug callback
+        cb(cur, "attn_norm_in", il);
+
+ // self-attention
+ {
+ cur = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+ // Q/K per-head RMSNorm (Step35 q_norm / k_norm)
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
+ // RoPE (partial rotary factors per layer)
+ const bool is_swa = hparams.is_swa(il);
+ ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il);
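+            // full-attention layers rotate half the head dim (partial_rotary_factor 0.5); SWA layers rotate all of it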
+ const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2);
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
+ ggml_tensor * attn_out = build_attn(inp_attn,
+ nullptr, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(attn_out, "attn_out", il);
+ // head-wise attention gate: sigmoid(g_proj(x)) in torch
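+            // each head's output is scaled by its own scalar gate: out[h] *= sigmoid(g_proj(x))[h]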
+ if (model.layers[il].wqkv_gate) {
+ ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens]
+ cb(gate, "attn_gate", il);
+
+ gate = ggml_sigmoid(ctx0, gate);
+ cb(gate, "attn_gate_sigmoid", il);
+
+ // reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens]
+ ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
+ ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
+ cb(gate_3d, "attn_gate_3d", il);
+
+ attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
+ cb(attn_3d, "attn_gated_3d", il);
+
+ attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
+ cb(attn_out, "attn_gated", il);
+ }
+
+ // output projection
+ cur = build_lora_mm(model.layers[il].wo, attn_out);
+ cb(cur, "attn_proj", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // dense MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
+ nullptr,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE routed experts
+ const bool norm_w = hparams.expert_weights_norm;
+ const float w_scale = hparams.expert_weights_scale;
+ const bool scale_w = w_scale != 0.0f;
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU,
+ norm_w, scale_w, w_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // shared expert MLP (always added on MoE layers in Step35)
+ ggml_tensor * sh_out = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, nullptr, nullptr,
+ model.layers[il].ffn_gate_shexp, nullptr, nullptr,
+ model.layers[il].ffn_down_shexp, nullptr, nullptr,
+ nullptr,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(sh_out, "ffn_shared_out", il);
+
+ cur = ggml_add(ctx0, moe_out, sh_out);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}