scale = scale.float()
if block_size is not None:
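+ # the scale may carry extra leading dims not covered by block_size (e.g. a stacked-experts dim), so expand only the trailing dims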
+ dim_offset = scale.ndim - len(block_size)
for i, size in enumerate(block_size):
- scale = scale.repeat_interleave(size, i)
+ scale = scale.repeat_interleave(size, dim_offset + i)
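+ # e.g. with block_size [128, 128], a [32, 112] scale expands to [4096, 14336] to match the weight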
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
scale = scale[tuple(slice(0, size) for size in weight.shape)]
+ # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1])
+ while scale.ndim < weight.ndim:
+ scale = scale.unsqueeze(-1)
+
return weight.float() * scale
# ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
elif quant_method == "fp8":
block_size = quant_config.get("weight_block_size")
for name in self.model_tensors.keys():
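+ # match any "*_scale_inv" suffix (not only ".weight_scale_inv") so fp8 scales attached to tensors not named "*.weight" are also dequantized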
- if name.endswith(".weight_scale_inv"):
+ if name.endswith("_scale_inv"):
weight_name = name.removesuffix("_scale_inv")
w = self.model_tensors[weight_name]
s = self.model_tensors[name]
tensors_to_remove.append(name)
if name.endswith(".activation_scale"): # unused
tensors_to_remove.append(name)
+ if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused
+ tensors_to_remove.append(name)
# mistral format
if name.endswith(".qscale_weight"):
weight_name = name.removesuffix("qscale_weight") + "weight"
def get_token_id(self, token: str) -> int:
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
- added_tokens_decoder = json.load(f)['added_tokens_decoder']
+ added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {}
for id_, token_data in added_tokens_decoder.items():
- if token_data["content"] == token:
+ if token_data.get("content") == token:
return int(id_)
+ # fallthrough to tokenizer.json
+ with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)
+ for token_data in tokenizer_json.get("added_tokens", []):
+ if token_data.get("content") == token:
+ return int(token_data["id"])
raise ValueError(f"Token '{token}' not found in tokenizer config.")
def set_gguf_parameters(self):
yield from super().modify_tensors(data_torch, name, bid)
-@ModelBase.register(
- "Mistral3ForConditionalGeneration",
- "Ministral3ForCausalLM",
-)
-class Mistral3Model(LlamaModel):
- model_arch = gguf.MODEL_ARCH.MISTRAL3
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- # for compatibility, we use LLAMA arch for older models
- # TODO: remove this once everyone has migrated to newer version of llama.cpp
- if self.hparams.get("model_type") != "ministral3":
- self.model_arch = gguf.MODEL_ARCH.LLAMA
- self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
- self.gguf_writer.add_architecture()
- self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- rope_params = self.rope_parameters
- if self.hparams.get("model_type") == "ministral3":
- assert rope_params, "ministral3 must have 'rope_parameters' config"
- assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
- self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
- self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
-
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
- name = name.replace("language_model.", "")
- if "multi_modal_projector" in name or "vision_tower" in name:
- return
-
- yield from super().modify_tensors(data_torch, name, bid)
-
-
@ModelBase.register("DeciLMForCausalLM")
class DeciModel(TextModel):
model_arch = gguf.MODEL_ARCH.DECI
# TODO @ngxson : remove this when we support MTP for deepseek models
skip_mtp = True
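+ # gates the expert-merging path in modify_tensors; set to False when the checkpoint already ships experts stacked as a single 3D tensor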
+ merge_expert = True
+
def set_vocab(self):
try:
self._set_vocab_gpt2()
return
# process the experts separately
- if name.find("mlp.experts") != -1:
+ if self.merge_expert and name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
raise ValueError(f"Unprocessed experts: {experts}")
+@ModelBase.register(
+    "Mistral3ForConditionalGeneration",
+    "Ministral3ForCausalLM",
+)
+class Mistral3Model(TextModel):
+    class Ministral3Model(LlamaModel):
+        model_arch = gguf.MODEL_ARCH.MISTRAL3
+
+        def set_gguf_parameters(self):
+            super().set_gguf_parameters()
+            rope_params = self.rope_parameters
+            if self.hparams.get("model_type") == "ministral3":
+                assert rope_params, "ministral3 must have 'rope_parameters' config"
+                assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
+                self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
+
+        def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+            name = name.replace("language_model.", "")
+            if "multi_modal_projector" in name or "vision_tower" in name:
+                return
+
+            yield from super().modify_tensors(data_torch, name, bid)
+
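+    # "mistral4" variant; reuses the DeepseekV2 conversion path (MLA attention + MoE tensors)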
+    class Mistral4Model(DeepseekV2Model):
+        model_arch = gguf.MODEL_ARCH.MISTRAL4
+        skip_mtp = False      # model contains no MTP layers, so no need to skip
+        merge_expert = False  # experts are already stacked as 3D
+
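+        # some expert tensors are stored without a ".weight" suffix; normalize the name so the base-class tensor mapping applies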
+        def modify_tensors(self, data_torch, name, bid):
+            if name.endswith(".down_proj") or name.endswith(".gate_up_proj"):
+                name = name + ".weight"
+            yield from super().modify_tensors(data_torch, name, bid)
+
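+    # the outer class is only a dispatcher: __init__ picks one of the nested impls
+    # above and every conversion step is forwarded to it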
+    model_arch = gguf.MODEL_ARCH.MISTRAL3  # unused
+    impl: TextModel
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams.get("model_type") == "mistral4":
+            self.impl = Mistral3Model.Mistral4Model(*args, **kwargs)
+        else:
+            self.impl = Mistral3Model.Ministral3Model(*args, **kwargs)
+
+    def set_vocab(self):
+        self.impl.set_vocab()
+
+    def set_gguf_parameters(self):
+        self.impl.set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        yield from self.impl.modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        self.impl.prepare_tensors()
+
+    def write_vocab(self):
+        self.impl.write_vocab()
+
+    def write(self):
+        self.impl.write()
+
+
@ModelBase.register("MiniMaxM2ForCausalLM")
class MiniMaxM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
RND1 = auto()
PANGU_EMBED = auto()
MISTRAL3 = auto()
+ MISTRAL4 = auto()
PADDLEOCR = auto()
MIMO2 = auto()
STEP35 = auto()
MODEL_ARCH.RND1: "rnd1",
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
MODEL_ARCH.MISTRAL3: "mistral3",
+ MODEL_ARCH.MISTRAL4: "mistral4",
MODEL_ARCH.PADDLEOCR: "paddleocr",
MODEL_ARCH.MIMO2: "mimo2",
MODEL_ARCH.STEP35: "step35",
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
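+ # MISTRAL4 largely mirrors the DEEPSEEK2 tensor set; FFN_GATE_UP_EXP is added for the fused gate_up_proj expert weights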
+ MODEL_ARCH.MISTRAL4: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_Q_A,
+ MODEL_TENSOR.ATTN_Q_B,
+ MODEL_TENSOR.ATTN_KV_A_MQA,
+ MODEL_TENSOR.ATTN_KV_B,
+ MODEL_TENSOR.ATTN_K_B,
+ MODEL_TENSOR.ATTN_V_B,
+ MODEL_TENSOR.ATTN_Q_A_NORM,
+ MODEL_TENSOR.ATTN_KV_A_NORM,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
+ ],
MODEL_ARCH.MIMO2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,