ggml : fix quants nans when all the group weights are very close to zero (llama/7313)

author slaren <redacted>

Sat, 18 May 2024 00:39:54 +0000 (02:39 +0200)

committer Georgi Gerganov <redacted>

Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)
author slaren <redacted>
Sat, 18 May 2024 00:39:54 +0000 (02:39 +0200)
committer Georgi Gerganov <redacted>
Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)
diff --git a/src/ggml-quants.c b/src/ggml-quants.c

index 9b291d522eba0909bd1a5516f8dcd6378ce8fc4a..7008e5d8b6c0d62d56aa421240351d99ef73493e 100644 (file)
--- a/src/ggml-quants.c
+++ b/src/ggml-quants.c
@@ -14,6 +14,12 @@
  #include <stdlib.h> // for qsort
  #include <stdio.h>  // for GGML_ASSERT
  
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid warnings for hundreds of casts
  // we should just be careful :)
@@ -1109,7 +1115,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
          float ax = fabsf(x[i]);
          if (ax > amax) { amax = ax; max = x[i]; }
      }
-    if (amax < 1e-30f) { // all zero
+    if (amax < GROUP_MAX_EPS) { // all zero
          for (int i = 0; i < n; ++i) {
              L[i] = 0;
          }
@@ -1177,7 +1183,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
          float ax = fabsf(x[i]);
          if (ax > amax) { amax = ax; max = x[i]; }
      }
-    if (!amax) { // all zero
+    if (amax < GROUP_MAX_EPS) { // all zero
          for (int i = 0; i < n; ++i) { L[i] = 0; }
          return 0.f;
      }
@@ -1646,7 +1652,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
              break;
          }
      }
-    return sumlx / suml2;
+    return sumlx/suml2;
  }
  
  static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
@@ -2653,7 +2659,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
  
          }
  
-        if (!max_abs_scale) {
+        if (max_abs_scale < GROUP_MAX_EPS) {
              memset(&y[i], 0, sizeof(block_q6_K));
              y[i].d = GGML_FP32_TO_FP16(0.f);
              x += QK_K;
@@ -2805,7 +2811,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
  
          }
  
-        if (!max_abs_scale) {
+        if (max_abs_scale < GROUP_MAX_EPS) {
              memset(&y[i], 0, sizeof(block_q6_K));
              y[i].d = GGML_FP32_TO_FP16(0.f);
              x += QK_K;
@@ -12599,7 +12605,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
              }
              float max = xval[0];
              for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
-            if (!max) {
+            if (max < GROUP_MAX_EPS) {
                  scales[ib] = 0;
                  memset(L, 0, 32);
                  continue;
@@ -12775,7 +12781,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
              }
              float max = xval[0];
              for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
-            if (!max) {
+            if (max < GROUP_MAX_EPS) {
                  scales[ib] = 0;
                  memset(L, 0, 16);
                  continue;
@@ -13216,7 +13222,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
              }
              float max = xval[0];
              for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
-            if (!max) {
+            if (max < GROUP_MAX_EPS_IQ3_XXS) {
                  scales[ib] = 0;
                  memset(L, 0, 32);
                  continue;
@@ -13756,7 +13762,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
              for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
              float max = fabsf(xb[0]);
              for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
-            if (!max) {
+            if (max < GROUP_MAX_EPS_IQ1_S) {
                  scales[ib] = 0;
                  memset(L, 1, block_size);
                  continue;
@@ -13944,7 +13950,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
              }
              float max = fabsf(xb[0]);
              for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
-            if (!max) {
+            if (max < GROUP_MAX_EPS_IQ1_M) {
                  scales[ib] = 0;
                  memset(L, 1, block_size);
                  continue;
@@ -14208,7 +14214,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
                  amax = ax; max = xb[j];
              }
          }
-        if (!amax) {
+        if (amax < GROUP_MAX_EPS) {
              scales[ib] = 0;
              continue;
          }
@@ -14429,7 +14435,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
              }
              float max = xval[0];
              for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
-            if (!max) {
+            if (max < GROUP_MAX_EPS_IQ2_S) {
                  scales[ib] = 0;
                  continue;
              }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 85ef21c2a5c19cfffa6a8e12c0b5246e3a2f27a2..c74e253db4b3bd428bef837e0554490574d684d3 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -16,6 +16,7 @@
  #include <thread>
  #include <vector>
  
+
  static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
      // static RNG initialization (revisit if n_threads stops being constant)
      static const size_t n_threads = std::thread::hardware_concurrency();
@@ -49,6 +50,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
          t.join();
      }
  
+#if 0
+    const char * val_str = getenv("GGML_TEST_EPS");
+    float val = 1e-9f;
+    if (val_str != nullptr) {
+        val = std::stof(val_str);
+        printf("GGML_TEST_EPS=%e\n", val);
+    }
+
+    // test quantization with very small values that may result in nan scales due to division by zero
+    if (ggml_is_quantized(tensor->type)) {
+        for (int i = 0; i < 256; i++) {
+            data[i] = val;
+        }
+    }
+#endif
+
      if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
          ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
      } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
@@ -64,6 +81,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
              }
          }
          ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
+        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
          ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
      } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
          // This is going to create some weird integers though.
author	slaren <redacted>
	Sat, 18 May 2024 00:39:54 +0000 (02:39 +0200)
committer	Georgi Gerganov <redacted>
	Tue, 28 May 2024 11:41:08 +0000 (14:41 +0300)
src/ggml-quants.c		patch | blob | history
tests/test-backend-ops.cpp		patch | blob | history