quantize_row_q2_K_reference(x, vy, k);
}
-size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- (void)hist; // TODO: collect histograms
-
- for (int j = 0; j < n; j += k) {
- block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
- quantize_row_q2_K_reference(src + j, y, k);
- }
- return (n/QK_K*sizeof(block_q2_K));
-}
-
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
float rmin, float rdelta, int nstep, bool use_mad) {
}
}
-size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
if (!quant_weights) {
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
quantize_row_q3_K_reference(x, vy, k);
}
-size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- (void)hist; // TODO: collect histograms
-
- for (int j = 0; j < n; j += k) {
- block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
- quantize_row_q3_K_reference(src + j, y, k);
- }
- return (n/QK_K*sizeof(block_q3_K));
-}
-
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
#if QK_K != 256
(void)quant_weights;
#endif
}
-size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
if (!quant_weights) {
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
quantize_row_q4_K_reference(x, y, k);
}
-size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- assert(k % QK_K == 0);
- (void)hist; // TODO: collect histograms
-
- for (int j = 0; j < n; j += k) {
- block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
- quantize_row_q4_K_reference(src + j, y, k);
- }
- return (n/QK_K*sizeof(block_q4_K));
-}
-
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
(void)quant_weights;
#endif
}
-size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
if (!quant_weights) {
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
quantize_row_q5_K_reference(x, y, k);
}
-size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- assert(k % QK_K == 0);
- (void)hist; // TODO: collect histograms
-
- for (int j = 0; j < n; j += k) {
- block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
- quantize_row_q5_K_reference(src + j, y, k);
- }
- return (n/QK_K*sizeof(block_q5_K));
-}
-
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
(void)quant_weights;
#endif
}
-size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
if (!quant_weights) {
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
quantize_row_q6_K_reference(x, y, k);
}
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK_K == 0);
- (void)hist; // TODO: collect histograms
-
- for (int j = 0; j < n; j += k) {
- block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
- quantize_row_q6_K_reference(src + j, y, k);
- }
- return (n/QK_K*sizeof(block_q6_K));
-}
-
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
(void)quant_weights;
#endif
}
-size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
if (!quant_weights) {
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
}
}
-size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
if (!quant_weights) {
- return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
+ quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
+ return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
char * qrow = (char *)dst;
}
}
-size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
if (!quant_weights) {
- return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
+ quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
+ return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
char * qrow = (char *)dst;
}
}
-size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
if (!quant_weights) {
- return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
+ quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
+ return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
char * qrow = (char *)dst;
}
}
-size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
if (!quant_weights) {
- return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
+ quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
+ return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
char * qrow = (char *)dst;
return nrow * row_size;
}
+size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { // Q8_0 entry point with the same (src, dst, nrow, n_per_row, quant_weights) signature as the other quantize_* functions; returns nrow * row_size
+ (void)quant_weights; // not used: this function has no importance-weighted path, the pointer is accepted only for signature parity
+ const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); // size of one quantized row as reported by ggml_row_size
+ quantize_row_q8_0_reference(src, dst, nrow*n_per_row); // all rows are quantized in a single call over nrow*n_per_row values
+ return nrow * row_size;
+}
+
// ====================== "True" 2-bit (de)-quantization
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
#endif
}
-void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
}
#endif
-void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
int * kmap_q2xs;
uint16_t * kneighbors_q2xs;
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
for (int k = 0; k < grid_size; ++k) {
int8_t * pos = (int8_t *)(the_grid + k);
}
num_neighbors += n;
}
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
iq2_data[gindex].neighbours = kneighbors_q2xs;
int counter = 0;
}
}
-size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
return nrow * nblock * sizeof(block_iq2_xxs);
}
-size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
int * kmap_q3xs;
uint16_t * kneighbors_q3xs;
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
for (int k = 0; k < grid_size; ++k) {
int8_t * pos = (int8_t *)(the_grid + k);
}
num_neighbors += n;
}
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
iq3_data[gindex].neighbours = kneighbors_q3xs;
int counter = 0;
}
}
-size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
}
#define IQ3S_BLOCK_SIZE 32
-size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
float scales[QK_K/IQ3S_BLOCK_SIZE];
// Reference row quantizer for IQ3_S.
// Asserts that k is a multiple of QK_K, then forwards to quantize_iq3_s treating
// x as a single row (nrow = 1) of k values with no quantization weights (NULL).
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
assert(k % QK_K == 0);
- quantize_iq3_s(x, y, 1, k, NULL, NULL);
+ quantize_iq3_s(x, y, 1, k, NULL); // hist argument dropped; the trailing NULL is quant_weights
}
}
}
-size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
}
-static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
+static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
float * scales, float * weight, uint8_t * L,
const int8_t * values,
}
}
-size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK4_NL == 0);
int nblock = n_per_row/QK4_NL;
char * qrow = (char *)dst;
// Reference row quantizer for IQ4_NL.
// Asserts that k is a multiple of QK4_NL, then forwards to quantize_iq4_nl treating
// x as a single row (nrow = 1) of k values with no quantization weights (NULL).
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
assert(k % QK4_NL == 0);
- quantize_iq4_nl(x, y, 1, k, NULL, NULL);
+ quantize_iq4_nl(x, y, 1, k, NULL); // hist argument dropped; the trailing NULL is quant_weights
}
-size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
#if QK_K == 64
- return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
#else
- (void)hist;
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
// Reference row quantizer for IQ4_XS.
// Asserts that k is a multiple of QK_K, then forwards to quantize_iq4_xs treating
// x as a single row (nrow = 1) of k values with no quantization weights (NULL).
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
assert(k % QK_K == 0);
- quantize_iq4_xs(x, y, 1, k, NULL, NULL);
+ quantize_iq4_xs(x, y, 1, k, NULL); // hist argument dropped; the trailing NULL is quant_weights
}
// =============================== 2.5625 bpw
}
}
-size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
- (void)hist;
+size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
// Reference row quantizer for IQ2_S.
// Asserts that k is a multiple of QK_K, then forwards to quantize_iq2_s treating
// x as a single row (nrow = 1) of k values with no quantization weights (NULL).
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
assert(k % QK_K == 0);
- quantize_iq2_s(x, y, 1, k, NULL, NULL);
+ quantize_iq2_s(x, y, 1, k, NULL); // hist argument dropped; the trailing NULL is quant_weights
}
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
+
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-//
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-//
-size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
}
// Converts ne floats at `from` into the format selected by `quant`, writing the result to `to`.
// The per-type switch over the removed ggml_quantize_* helpers is replaced by a single
// ggml_quantize_chunk call, so the set of supported types is whatever that function handles.
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch(quant) {
- case GGML_TYPE_F32:
- memcpy(to, from, sizeof(float) * ne);
- break;
- case GGML_TYPE_Q4_0:
- ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q4_1:
- ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q5_0:
- ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q5_1:
- ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q8_0:
- ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q2_K:
- ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q3_K:
- ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q4_K:
- ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q5_K:
- ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data());
- break;
- case GGML_TYPE_Q6_K:
- ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data());
- break;
- default:
- GGML_ASSERT(false);
- }
+ ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); // start = 0, one row of ne values, no importance matrix; NOTE(review): ne is size_t but n_per_row is int, so very large ne would be narrowed
}
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
ggml_critical_section_end();
}
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK4_0 == 0);
- const int nb = k / QK4_0;
-
- for (int b = 0; b < n; b += k) {
- block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
-
- quantize_row_q4_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK4_0; j += 2) {
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK4_0*sizeof(block_q4_0));
-}
-
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK4_1 == 0);
- const int nb = k / QK4_1;
-
- for (int b = 0; b < n; b += k) {
- block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
-
- quantize_row_q4_1_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK4_1; j += 2) {
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK4_1*sizeof(block_q4_1));
-}
-
-size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK5_0 == 0);
- const int nb = k / QK5_0;
-
- for (int b = 0; b < n; b += k) {
- block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
-
- quantize_row_q5_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- uint32_t qh;
- memcpy(&qh, &y[i].qh, sizeof(qh));
-
- for (int j = 0; j < QK5_0; j += 2) {
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
- // cast to 16 bins
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK5_0*sizeof(block_q5_0));
-}
-
-size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK5_1 == 0);
- const int nb = k / QK5_1;
-
- for (int b = 0; b < n; b += k) {
- block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
-
- quantize_row_q5_1_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- uint32_t qh;
- memcpy(&qh, &y[i].qh, sizeof(qh));
-
- for (int j = 0; j < QK5_1; j += 2) {
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
- // cast to 16 bins
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK5_1*sizeof(block_q5_1));
-}
-
-size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK8_0 == 0);
- const int nb = k / QK8_0;
-
- for (int b = 0; b < n; b += k) {
- block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
-
- quantize_row_q8_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK8_0; ++j) {
- const int8_t vi = y[i].qs[j];
-
- hist[vi/16 + 8]++;
- }
- }
- }
-
- return (n/QK8_0*sizeof(block_q8_0));
-}
-
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
return
type == GGML_TYPE_IQ2_XXS ||
type == GGML_TYPE_IQ1_S;
}
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
- int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+size_t ggml_quantize_chunk(
+ enum ggml_type type,
+ const float * src,
+ void * dst,
+ int start,
+ int nrows,
+ int n_per_row,
+ const float * imatrix) {
+ const int n = nrows * n_per_row;
+
+ if (ggml_quantize_requires_imatrix(type)) {
+ GGML_ASSERT(imatrix != NULL);
+ }
+
+ GGML_ASSERT(start % type_traits[type].blck_size == 0);
+ GGML_ASSERT(start % n_per_row == 0);
+
ggml_quantize_init(type); // this is noop if already initialized
+
+ const size_t start_row = start / n_per_row;
+ const size_t row_size = ggml_row_size(type, n_per_row);
+
size_t result = 0;
- int n = nrows * n_per_row;
+
switch (type) {
- case GGML_TYPE_Q4_0:
- {
- GGML_ASSERT(start % QK4_0 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q4_1:
- {
- GGML_ASSERT(start % QK4_1 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_0:
- {
- GGML_ASSERT(start % QK5_0 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_1:
- {
- GGML_ASSERT(start % QK5_1 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q8_0:
- {
- GGML_ASSERT(start % QK8_0 == 0);
- block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
- result = ggml_quantize_q8_0(src + start, block, n, n, hist);
- } break;
- case GGML_TYPE_Q2_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q3_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q4_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q6_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_XXS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- GGML_ASSERT(imatrix);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_XS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- GGML_ASSERT(imatrix);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ3_XXS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ3_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ1_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ4_NL:
+ case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#if QK_K == 64
- case GGML_TYPE_IQ4_XS:
-#endif
- {
- GGML_ASSERT(start % QK4_NL == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
-#if QK_K != 64
- case GGML_TYPE_IQ4_XS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+#else
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#endif
case GGML_TYPE_F16:
{
default:
assert(false);
}
+
+ GGML_ASSERT(result == nrows * row_size);
+
return result;
}
GGML_API void ggml_quantize_init(enum ggml_type type);
GGML_API void ggml_quantize_free(void);
- // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
- GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
- GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-
// some quantization types cannot be used without an importance matrix
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
// calls ggml_quantize_init internally (i.e. can allocate memory)
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
- int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ GGML_API size_t ggml_quantize_chunk( // returns the per-type quantizer's result; the implementation asserts it equals nrows * row_size
+ enum ggml_type type, // selects which per-type quantize_* routine is dispatched to
+ const float * src, // float input; reading begins at src + start
+ void * dst, // output buffer; writing begins at byte offset start_row * row_size
+ int start, // element offset into src; NOTE(review): presumably a multiple of n_per_row (the previous implementation asserted this) - confirm
+ int nrows, // forwarded to the per-type quantizer; presumably the number of rows in this chunk
+ int n_per_row, // forwarded to the per-type quantizer; presumably the number of elements per row
+ const float * imatrix); // forwarded unchanged to the per-type quantizer; see ggml_quantize_requires_imatrix
//
// gguf