self.gguf_writer.add_expert_group_used_count(n_group_used)
logger.info(f"gguf: expert groups used count = {n_group_used}")
+ # Expert score gating function: probe the known hparam spellings, since
+ # HF configs are inconsistent across model families.
+ if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+     if score_func == "sigmoid":
+         self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+     elif score_func == "softmax":
+         self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+     elif score_func == "noaux_tc":
+         # NOTE(review): one of the duplicated sites this hunk replaces accepted
+         # "noaux_tc" and mapped it to SIGMOID (DeepSeek/dots-style group top-k
+         # without aux loss); keep that mapping to avoid a conversion regression.
+         self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+     else:
+         raise ValueError(f"Unsupported expert score gating function value: {score_func}")
+     logger.info(f"gguf: expert score gating function = {score_func}")
+
if (head_dim := self.hparams.get("head_dim")) is not None:
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
- # Expert Gating Function
- score_func = self.hparams.get("score_func")
- if score_func == "sigmoid":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
- elif score_func == "softmax":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
- elif score_func is not None:
- raise ValueError(f"Unsupported score_function value: {score_func}")
-
# Route normalization and scaling
if (route_norm := self.hparams.get("route_norm")) is not None:
self.gguf_writer.add_expert_weights_norm(route_norm)
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
- if hparams["scoring_func"] == "sigmoid":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
- elif hparams["scoring_func"] == "softmax":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
- else:
- raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
-
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
rope_scaling = self.hparams.get("rope_scaling") or {}
def set_gguf_parameters(self):
super().set_gguf_parameters()
- if self.hparams["scoring_func"] == "sigmoid":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
- elif self.hparams["scoring_func"] == "softmax":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
- else:
- raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
- if self.hparams["scoring_func"] == "noaux_tc":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
- else:
- raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
-
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
- if hparams["score_function"] == "sigmoid":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
- elif hparams["score_function"] == "softmax":
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
- else:
- raise ValueError(f"Unsupported score_function value: {hparams['score_function']}")
-
if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
self.gguf_writer.add_nextn_predict_layers(nextn_layers)