ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
cb(weights_sum, "ffn_moe_weights_sum", il);
- if (arch == LLM_ARCH_BAILINGMOE2) {
- weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
- cb(weights_sum, "ffn_moe_weights_sum_biased", il);
- }
+ // Avoid division by zero: clamp to 2^-14, the smallest positive normal number representable by F16
+ weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+ cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
cb(weights, "ffn_moe_weights_norm", il);