ggml : minor naming changes (llama/8433)

author Georgi Gerganov <redacted>
Fri, 12 Jul 2024 07:46:02 +0000 (10:46 +0300)
committer Georgi Gerganov <redacted>
Thu, 8 Aug 2024 19:48:46 +0000 (22:48 +0300)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h

index 1e367753738d9fce76f9bd3983d918baf00bb4a8..f2145ff356cbd1bd16251ea6b0341fc6e6e78b7b 100644 (file)
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -714,9 +714,9 @@ extern "C" {
      GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
      GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
  
-    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
-    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
  
      GGML_DEPRECATED(
      GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
@@ -2410,31 +2410,31 @@ extern "C" {
  #endif
      typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
      typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                      const void * GGML_RESTRICT y, size_t by, int nrc);
-    typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t nr,
-                                             int64_t k, int64_t bx);
-    typedef void (*ggml_gemv_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                      const void * GGML_RESTRICT y, int nr, int nc);
-    typedef void (*ggml_gemm_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                      const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_from_float_to_mat_t)
+                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
+    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                       const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_gemv_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                       const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                       const void * GGML_RESTRICT y, int nr, int nc);
  
      typedef struct {
-        const char      * type_name;
-        int               blck_size;
-        size_t            type_size;
-        bool              is_quantized;
-        ggml_to_float_t   to_float;
-        ggml_from_float_t from_float;
-        ggml_from_float_t from_float_reference;
-        ggml_vec_dot_t    vec_dot;
-        enum ggml_type    vec_dot_type;
-        int64_t           nrows; // number of rows to process simultaneously;
-        int64_t           ncols; // number of columns to process simultaneously;
-        int64_t           interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
+        const char             * type_name;
+        int64_t                  blck_size;
+        int64_t                  blck_size_interleave; // interleave elements in blocks
+        size_t                   type_size;
+        bool                     is_quantized;
+        ggml_to_float_t          to_float;
+        ggml_from_float_t        from_float;
+        ggml_from_float_t        from_float_ref;
          ggml_from_float_to_mat_t from_float_to_mat;
-        ggml_gemv_t       gemv;
-        ggml_gemm_t       gemm;
+        ggml_vec_dot_t           vec_dot;
+        enum ggml_type           vec_dot_type;
+        int64_t                  nrows; // number of rows to process simultaneously
+        int64_t                  ncols; // number of columns to process simultaneously
+        ggml_gemv_t              gemv;
+        ggml_gemm_t              gemm;
      } ggml_type_traits_t;
  
      GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c

index 0f7e0560964b8b3903292d6813cc94ae6ffef006..bf295b01c8dc537cee95dcfe508de7bc3421b846 100644 (file)
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -658,7 +658,7 @@ static inline __m128i packNibbles( __m256i bytes ) {
  #endif  //__loongarch_asx
  
  // reference implementation for deterministic creation of model files
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
      static const int qk = QK4_0;
  
      assert(k % qk == 0);
@@ -696,11 +696,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
  }
  
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q4_0_reference(x, y, k);
+    quantize_row_q4_0_ref(x, y, k);
  }
  
  
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
+void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
      const int qk = QK4_1;
  
      assert(k % qk == 0);
@@ -738,10 +738,10 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
  }
  
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q4_1_reference(x, y, k);
+    quantize_row_q4_1_ref(x, y, k);
  }
  
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
+void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
      static const int qk = QK5_0;
  
      assert(k % qk == 0);
@@ -786,10 +786,10 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
  }
  
  void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q5_0_reference(x, y, k);
+    quantize_row_q5_0_ref(x, y, k);
  }
  
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
+void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
      const int qk = QK5_1;
  
      assert(k % qk == 0);
@@ -834,11 +834,11 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
  }
  
  void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q5_1_reference(x, y, k);
+    quantize_row_q5_1_ref(x, y, k);
  }
  
  // reference implementation for deterministic creation of model files
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
+void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
      assert(k % QK8_0 == 0);
      const int nb = k / QK8_0;
  
@@ -1144,12 +1144,12 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  #else
      GGML_UNUSED(nb);
      // scalar
-    quantize_row_q8_0_reference(x, y, k);
+    quantize_row_q8_0_ref(x, y, k);
  #endif
  }
  
  // reference implementation for deterministic creation of model files
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
+void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
      assert(QK8_1 == 32);
      assert(k % QK8_1 == 0);
      const int nb = k / QK8_1;
@@ -1508,7 +1508,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  #else
      GGML_UNUSED(nb);
      // scalar
-    quantize_row_q8_1_reference(x, y, k);
+    quantize_row_q8_1_ref(x, y, k);
  #endif
  }
  
@@ -1899,7 +1899,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
  
  //========================- 2-bit (de)-quantization
  
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
+void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      const int nb = k / QK_K;
  
@@ -2002,7 +2002,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
  }
  
  void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
-    quantize_row_q2_K_reference(x, vy, k);
+    quantize_row_q2_K_ref(x, vy, k);
  }
  
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
@@ -2226,7 +2226,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
  size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
      if (!quant_weights) {
-        quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
      }
      else {
          char * qrow = (char *)dst;
@@ -2241,7 +2241,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
  
  //========================= 3-bit (de)-quantization
  
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
+void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      const int nb = k / QK_K;
  
@@ -2368,7 +2368,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
  }
  
  void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
-    quantize_row_q3_K_reference(x, vy, k);
+    quantize_row_q3_K_ref(x, vy, k);
  }
  
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
@@ -2458,7 +2458,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
  size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
      if (!quant_weights) {
-        quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
      }
      else {
          char * qrow = (char *)dst;
@@ -2473,7 +2473,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
  
  // ====================== 4-bit (de)-quantization
  
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
+void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      const int nb = k / QK_K;
  
@@ -2572,7 +2572,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
  void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_q4_K * restrict y = vy;
-    quantize_row_q4_K_reference(x, y, k);
+    quantize_row_q4_K_ref(x, y, k);
  }
  
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
@@ -2651,7 +2651,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
  size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
      if (!quant_weights) {
-        quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
      }
      else {
          char * qrow = (char *)dst;
@@ -2666,7 +2666,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
  
  // ====================== 5-bit (de)-quantization
  
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
+void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      const int64_t nb = k / QK_K;
  
@@ -2783,7 +2783,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
  void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_q5_K * restrict y = vy;
-    quantize_row_q5_K_reference(x, y, k);
+    quantize_row_q5_K_ref(x, y, k);
  }
  
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
@@ -2882,7 +2882,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
  size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
      if (!quant_weights) {
-        quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
      }
      else {
          char * qrow = (char *)dst;
@@ -2897,7 +2897,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
  
  // ====================== 6-bit (de)-quantization
  
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
+void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      const int64_t nb = k / QK_K;
  
@@ -3001,7 +3001,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
  void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_q6_K * restrict y = vy;
-    quantize_row_q6_K_reference(x, y, k);
+    quantize_row_q6_K_ref(x, y, k);
  }
  
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
@@ -3091,7 +3091,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
  size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
      if (!quant_weights) {
-        quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
      }
      else {
          char * qrow = (char *)dst;
@@ -3108,7 +3108,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
      static_assert(QK4_0 == 32, "QK4_0 must be 32");
  
      if (!quant_weights) {
-        quantize_row_q4_0_reference(x, y, n_per_row);
+        quantize_row_q4_0_ref(x, y, n_per_row);
          return;
      }
  
@@ -3134,7 +3134,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
  
  size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      if (!quant_weights) {
-        quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
          return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
      }
      size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
@@ -3151,7 +3151,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
      static_assert(QK4_1 == 32, "QK4_1 must be 32");
  
      if (!quant_weights) {
-        quantize_row_q4_1_reference(x, y, n_per_row);
+        quantize_row_q4_1_ref(x, y, n_per_row);
          return;
      }
  
@@ -3179,7 +3179,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
  
  size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      if (!quant_weights) {
-        quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
          return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
      }
      size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
@@ -3196,7 +3196,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
      static_assert(QK5_0 == 32, "QK5_0 must be 32");
  
      if (!quant_weights) {
-        quantize_row_q5_0_reference(x, y, n_per_row);
+        quantize_row_q5_0_ref(x, y, n_per_row);
          return;
      }
  
@@ -3233,7 +3233,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
  
  size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      if (!quant_weights) {
-        quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
          return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
      }
      size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
@@ -3250,7 +3250,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
      static_assert(QK5_1 == 32, "QK5_1 must be 32");
  
      if (!quant_weights) {
-        quantize_row_q5_1_reference(x, y, n_per_row);
+        quantize_row_q5_1_ref(x, y, n_per_row);
          return;
      }
  
@@ -3286,7 +3286,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
  
  size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      if (!quant_weights) {
-        quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
          return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
      }
      size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
@@ -3302,7 +3302,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
  size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
      (void)quant_weights; // not used
      const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
-    quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
+    quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
      return nrow * row_size;
  }
  
@@ -3590,7 +3590,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
  
  //===================================== Q8_K ==============================================
  
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
+void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      const int64_t nb = k / QK_K;
  
@@ -3641,7 +3641,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
  }
  
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q8_K_reference(x, y, k);
+    quantize_row_q8_K_ref(x, y, k);
  }
  
  //===================================== Dot ptoducts =================================
@@ -13542,10 +13542,10 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
  void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_iq3_xxs * restrict y = vy;
-    quantize_row_iq3_xxs_reference(x, y, k);
+    quantize_row_iq3_xxs_ref(x, y, k);
  }
  
-void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
+void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
  }
@@ -13758,10 +13758,10 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
  void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_iq3_s * restrict y = vy;
-    quantize_row_iq3_s_reference(x, y, k);
+    quantize_row_iq3_s_ref(x, y, k);
  }
  
-void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
+void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      quantize_iq3_s(x, y, 1, k, NULL);
  }
@@ -14499,7 +14499,7 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k
      }
  }
  
-void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
+void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
      assert(k % QK4_NL == 0);
      quantize_row_iq4_nl(x, y, k);
  }
@@ -14527,10 +14527,10 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
  void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_iq4_xs * restrict y = vy;
-    quantize_row_iq4_xs_reference(x, y, k);
+    quantize_row_iq4_xs_ref(x, y, k);
  }
  
-void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
+void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      quantize_iq4_xs(x, y, 1, k, NULL);
  }
@@ -14717,7 +14717,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
      return nrow * nblock * sizeof(block_iq2_s);
  }
  
-void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
+void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
      assert(k % QK_K == 0);
      quantize_iq2_s(x, y, 1, k, NULL);
  }
@@ -14725,7 +14725,7 @@ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restri
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
      assert(k % QK_K == 0);
      block_iq2_s * restrict y = vy;
-    quantize_row_iq2_s_reference(x, y, k);
+    quantize_row_iq2_s_ref(x, y, k);
  }
  
  static bool validate_float(float f, size_t i) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h

index 30983b8728fa24c3d655bf6a75ad4075525b2061..88b1f3269646d8cb19a7531a2ef458820a0346e4 100644 (file)
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -12,25 +12,25 @@ extern "C" {
  #endif
  
  // Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
  
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c

index 10e96827652c704d8b2a76b9d5536646cef36d7a..f2c58c374163ea6918e9f91f1f4088fa6f22ba79 100644 (file)
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -592,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = false,
          .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
          .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
          .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
          .vec_dot_type             = GGML_TYPE_F16,
          .nrows                    = 1,
@@ -604,7 +604,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
          .from_float               = quantize_row_q4_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_0_ref,
          .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
          .vec_dot_type             = GGML_TYPE_Q8_0,
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -620,7 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
          .from_float               = quantize_row_q4_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
          .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
          .vec_dot_type             = GGML_TYPE_Q8_1,
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -636,7 +636,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = false,
          .to_float                 = NULL,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = NULL,
          .vec_dot_type             = GGML_TYPE_COUNT,
          .nrows                    = 1,
@@ -648,7 +648,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = false,
          .to_float                 = NULL,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = NULL,
          .vec_dot_type             = GGML_TYPE_COUNT,
          .nrows                    = 1,
@@ -660,7 +660,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
          .from_float               = quantize_row_q5_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_0_ref,
          .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
          .vec_dot_type             = GGML_TYPE_Q8_0,
          .nrows                    = 1,
@@ -672,7 +672,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
          .from_float               = quantize_row_q5_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_1_ref,
          .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
          .vec_dot_type             = GGML_TYPE_Q8_1,
          .nrows                    = 1,
@@ -684,7 +684,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
          .from_float               = quantize_row_q8_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q8_0_ref,
+        .from_float_to_mat        = quantize_mat_q8_0,
          .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
          .vec_dot_type             = GGML_TYPE_Q8_0,
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -692,7 +693,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  #else
          .nrows                    = 1,
  #endif
-        .from_float_to_mat        = quantize_mat_q8_0,
      },
      [GGML_TYPE_Q8_1] = {
          .type_name                = "q8_1",
@@ -700,7 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .type_size                = sizeof(block_q8_1),
          .is_quantized             = true,
          .from_float               = quantize_row_q8_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q8_1_ref,
          .vec_dot_type             = GGML_TYPE_Q8_1,
          .nrows                    = 1,
      },
@@ -711,7 +711,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
          .from_float               = quantize_row_q2_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q2_K_ref,
          .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -723,7 +723,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
          .from_float               = quantize_row_q3_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q3_K_ref,
          .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -735,7 +735,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
          .from_float               = quantize_row_q4_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_K_ref,
          .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -747,7 +747,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
          .from_float               = quantize_row_q5_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_K_ref,
          .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -759,7 +759,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
          .from_float               = quantize_row_q6_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q6_K_ref,
          .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -771,7 +771,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -783,7 +783,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -795,7 +795,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
          .from_float               = quantize_row_iq3_xxs,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
          .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -807,7 +807,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq3_s,
          .from_float               = quantize_row_iq3_s,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq3_s_reference,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_s_ref,
          .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -819,7 +819,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq2_s,
          .from_float               = quantize_row_iq2_s,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq2_s_reference,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_s_ref,
          .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -831,7 +831,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -843,7 +843,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -855,7 +855,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
          .from_float               = quantize_row_iq4_nl,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_nl_reference,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_nl_ref,
          .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
          .vec_dot_type             = GGML_TYPE_Q8_0,
          .nrows                    = 1,
@@ -867,7 +867,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = true,
          .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
          .from_float               = quantize_row_iq4_xs,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_xs_ref,
          .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
          .vec_dot_type             = GGML_TYPE_Q8_K,
          .nrows                    = 1,
@@ -886,7 +886,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
          .is_quantized             = false,
          .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
          .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
-        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row,
          .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
          .vec_dot_type             = GGML_TYPE_BF16,
          .nrows                    = 1,
@@ -894,48 +894,48 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
      [GGML_TYPE_Q4_0_4_4] = {
          .type_name                = "q4_0_4x4",
          .blck_size                = QK4_0,
+        .blck_size_interleave     = 4,
          .type_size                = sizeof(block_q4_0),
          .is_quantized             = true,
          .to_float                 = NULL,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = NULL,
          .vec_dot_type             = GGML_TYPE_Q8_0,
          .nrows                    = 1,
          .ncols                    = 4,
-        .interleave_blcksize      = 4,
          .gemv                     = ggml_gemv_q4_0_4x4_q8_0,
          .gemm                     = ggml_gemm_q4_0_4x4_q8_0,
      },
      [GGML_TYPE_Q4_0_4_8] = {
          .type_name                = "q4_0_4x8",
          .blck_size                = QK4_0,
+        .blck_size_interleave     = 8,
          .type_size                = sizeof(block_q4_0),
          .is_quantized             = true,
          .to_float                 = NULL,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = NULL,
          .vec_dot_type             = GGML_TYPE_Q8_0,
          .nrows                    = 1,
          .ncols                    = 4,
-        .interleave_blcksize      = 8,
          .gemv                     = ggml_gemv_q4_0_4x8_q8_0,
          .gemm                     = ggml_gemm_q4_0_4x8_q8_0,
      },
      [GGML_TYPE_Q4_0_8_8] = {
          .type_name                = "q4_0_8x8",
          .blck_size                = QK4_0,
+        .blck_size_interleave     = 8,
          .type_size                = sizeof(block_q4_0),
          .is_quantized             = true,
          .to_float                 = NULL,
          .from_float               = NULL,
-        .from_float_reference     = NULL,
+        .from_float_ref           = NULL,
          .vec_dot                  = NULL,
          .vec_dot_type             = GGML_TYPE_Q8_0,
          .nrows                    = 1,
          .ncols                    = 8,
-        .interleave_blcksize      = 8,
          .gemv                     = ggml_gemv_q4_0_8x8_q8_0,
          .gemm                     = ggml_gemm_q4_0_8x8_q8_0,
      }
@@ -3115,7 +3115,7 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
      return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  }
  
-GGML_CALL int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
      return type_traits[type].blck_size;
  }
  
@@ -12192,15 +12192,14 @@ static void ggml_compute_forward_mul_mat(
  
      const enum ggml_type type = src0->type;
  
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
-    int64_t           const matmul_num_cols       = type_traits[type].ncols;
-    int64_t           const interleave_blcksize   = type_traits[type].interleave_blcksize;
-    ggml_from_float_to_mat_t const from_float_to_mat
-                                                  = type_traits[vec_dot_type].from_float_to_mat;
-    ggml_gemv_t       const gemv                  = type_traits[type].gemv;
-    ggml_gemm_t       const gemm                  = type_traits[type].gemm;
+    enum ggml_type           const vec_dot_type         = type_traits[type].vec_dot_type;
+    ggml_from_float_t        const from_float           = type_traits[vec_dot_type].from_float;
+    ggml_from_float_to_mat_t const from_float_to_mat    = type_traits[vec_dot_type].from_float_to_mat;
+    int64_t                  const vec_dot_num_rows     = type_traits[type].nrows;
+    int64_t                  const matmul_num_cols      = type_traits[type].ncols;
+    int64_t                  const blck_size_interleave = type_traits[type].blck_size_interleave;
+    ggml_gemv_t              const gemv                 = type_traits[type].gemv;
+    ggml_gemm_t              const gemm                 = type_traits[type].gemm;
  
      GGML_ASSERT(ne0 == ne01);
      GGML_ASSERT(ne1 == ne11);
@@ -12264,14 +12263,14 @@ UseGgmlGemm1:;
                      for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
                          from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
                                            (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
-                                          4, ne10, interleave_blcksize);
+                                          4, ne10, blck_size_interleave);
                      }
                      i11_processed = ne11 - ne11 % 4;
                  }
                  for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-                    from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
-                                          (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
-                                           ne10);
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                           (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                           ne10);
                  }
              }
          }
@@ -12355,7 +12354,7 @@ UseGgmlGemm2:;
          int64_t src0_start = (ith * ne01) / nth;
          int64_t src0_end   = ((ith + 1) * ne01) / nth;
          src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
-        src0_end   = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
+        src0_end   = (src0_end   % matmul_num_cols) ? src0_end   + matmul_num_cols - (src0_end   % matmul_num_cols): src0_end;
          if (src0_start >= src0_end) return;
  
          // If there are more than three rows in src1, use gemm; otherwise, use gemv.
@@ -12413,11 +12412,11 @@ static void ggml_compute_forward_mul_mat_id(
  
      const bool src1_cont = ggml_is_contiguous(src1);
  
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-    int64_t           const matmul_num_cols       = type_traits[type].ncols;
-    ggml_gemv_t       const gemv                  = type_traits[type].gemv;
+    ggml_vec_dot_t    const vec_dot         = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type    = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float      = type_traits[vec_dot_type].from_float;
+    int64_t           const matmul_num_cols = type_traits[type].ncols;
+    ggml_gemv_t       const gemv            = type_traits[type].gemv;
  
      // we don't support permuted src0 or src1
      GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -12458,9 +12457,9 @@ static void ggml_compute_forward_mul_mat_id(
          for (int64_t i13 = 0; i13 < ne13; ++i13) {
              for (int64_t i12 = 0; i12 < ne12; ++i12) {
                  for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
-                    from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
-                                          (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
-                                           ne10);
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                               ne10);
                  }
              }
          }
@@ -21062,8 +21061,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                  (int64_t) info->ne[3];
  
              if (ne % ggml_blck_size(info->type) != 0) {
-                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                        __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
+                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
+                        __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
                  fclose(file);
                  gguf_free(ctx);
                  return NULL;
author	Georgi Gerganov <redacted>
	Fri, 12 Jul 2024 07:46:02 +0000 (10:46 +0300)
committer	Georgi Gerganov <redacted>
	Thu, 8 Aug 2024 19:48:46 +0000 (22:48 +0300)
ggml/include/ggml.h		patch \| blob \| history
ggml/src/ggml-quants.c		patch \| blob \| history
ggml/src/ggml-quants.h		patch \| blob \| history
ggml/src/ggml.c		patch \| blob \| history