From: compilade
Date: Fri, 8 Aug 2025 21:48:26 +0000 (-0400)
Subject: gguf-py : add Numpy MXFP4 de/quantization support (llama/15111)
X-Git-Tag: upstream/1.8.0~286
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=62566a54365795cd509d8075cd2ea706d491d72f;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

gguf-py : add Numpy MXFP4 de/quantization support (llama/15111)

* gguf-py : add MXFP4 de/quantization support

* ggml-quants : handle zero amax for MXFP4
---

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index a57d2a16..94f6405c 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -288,7 +288,7 @@ void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RE
             }
         }

-        const uint8_t e = (uint8_t) (floorf(log2f(amax)) - 2 + 127);
+        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0;

         const float d = GGML_E8M0_TO_FP32_HALF(e);