metal : disable fast math in all quantize kernels (#14528)

author Georgi Gerganov <redacted>
Fri, 4 Jul 2025 16:19:09 +0000 (19:19 +0300)
committer GitHub <redacted>
Fri, 4 Jul 2025 16:19:09 +0000 (19:19 +0300)
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index dc7a0af2769dccfe158f01db9f210129b7165c95..22240bab472493178ea8312cb7ea934f3fb25321 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
  }
  
  void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
+#pragma METAL fp math_mode(safe)
      float amax = 0.0f; // absolute max
      float max  = 0.0f;
  
@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
  }
  
  void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
+#pragma METAL fp math_mode(safe)
      float amax = 0.0f; // absolute max
      float max  = 0.0f;
  
@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re
  }
  
  void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
+#pragma METAL fp math_mode(safe)
      float amax = 0.0f; // absolute max
  
      for (int j = 0; j < QK8_0; j++) {
author	Georgi Gerganov <redacted>
	Fri, 4 Jul 2025 16:19:09 +0000 (19:19 +0300)
committer	GitHub <redacted>
	Fri, 4 Jul 2025 16:19:09 +0000 (19:19 +0300)