                    s = self.model_tensors[name]
                    self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                    tensors_to_remove.append(name)
+ if name.endswith(".activation_scale"): # unused
+ tensors_to_remove.append(name)
+ # mistral format
+ if name.endswith(".qscale_weight"):
+ weight_name = name.removesuffix("qscale_weight") + "weight"
+ w = self.model_tensors[weight_name]
+ s = self.model_tensors[name]
+ self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
+ tensors_to_remove.append(name)
+ if name.endswith(".qscale_act"):
+ tensors_to_remove.append(name)
        elif quant_method == "gptq":
            for name in self.model_tensors.keys():
                if name.endswith(".qweight"):
        self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
-        if name.endswith("weight_scale_inv"):
-            raise ValueError("This is a quantized weight, please use BF16 weight instead")
-
        name = name.replace("language_model.", "")
        if "multi_modal_projector" in name or "vision_tower" in name:
            return []
+
        return super().modify_tensors(data_torch, name, bid)
        self.gguf_writer.add_architecture()
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+    def dequant_model(self):
+        # transform the mistral-format quantization config into the HF-style
+        # "quantization_config" that the generic fp8 dequant path expects
+        quant_config = self.hparams.get("quantization")
+        if quant_config is not None:
+            assert quant_config["qformat_weight"] == "fp8_e4m3"
+            self.hparams["quantization_config"] = {
+                "activation_scheme": "static",
+                "quant_method": "fp8",
+                "weight_block_size": None,
+            }
+        return super().dequant_model()
+
    @staticmethod
    def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
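For readers without the rest of the converter open: `dequant_simple(weight, scale, block_size)` is the existing helper both branches reuse. Its actual implementation is not shown here, but conceptually it multiplies the FP8 weight by its stored scale, broadcasting block-wise scales when `weight_block_size` is set. A rough sketch of that idea (hypothetical helper name and shape handling, not the script's real code), assuming torch tensors:

```python
import torch

def dequant_fp8_sketch(weight: torch.Tensor, scale: torch.Tensor,
                       block_size: list[int] | None) -> torch.Tensor:
    # weight: fp8_e4m3 data; scale: a scalar / per-channel scale, or a
    # [ceil(H/bh), ceil(W/bw)] matrix of per-block scales
    w = weight.to(torch.float32)
    s = scale.to(torch.float32)
    if block_size is None:
        return w * s  # per-tensor / per-channel scale
    bh, bw = block_size
    # expand each block scale over its bh x bw block, trimming edge blocks
    s = s.repeat_interleave(bh, dim=0)[: w.shape[0]]
    s = s.repeat_interleave(bw, dim=1)[: w.shape[1]]
    return w * s
```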