fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
-fout.write(struct.pack("i", hparams["use_parallel_residual"]))
+fout.write(struct.pack("i", hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True))
fout.write(struct.pack("i", ftype))
# TODO: temporary hack to not deal with implementing the tokenizer