// quantization
//
+ // - ggml_quantize_init can be called multiple times with the same type
+ // it will only initialize the quantization tables for the first call or after ggml_quantize_free
+ // it is automatically called by ggml_quantize_chunk for convenience
+ //
+ // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
+ // call this at the end of the program to avoid memory leaks
+ //
+ // note: these are thread-safe
+ //
+ GGML_API void ggml_quantize_init(enum ggml_type type);
+ GGML_API void ggml_quantize_free(void);
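//
// Usage sketch (illustrative only, not part of this change): explicit initialization is
// optional because ggml_quantize_chunk initializes the tables on demand, but calling
// ggml_quantize_free at exit releases everything allocated by ggml_quantize_init:
//
//     ggml_quantize_init(GGML_TYPE_IQ2_XXS); // idempotent: repeated calls with the same type are noops
//     /* ... quantize tensors, e.g. via ggml_quantize_chunk ... */
//     ggml_quantize_free();                  // free the quantization tables
//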
+
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ // some quantization types cannot be used without an importance matrix
+ GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
+
+ // calls ggml_quantize_init internally (i.e. can allocate memory)
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
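// Illustrative helper (hypothetical, not part of this diff) showing how a caller might drive
// the API above: pass an importance matrix when the type requires one, otherwise NULL is fine.
// Assumes `src` holds nrows*n_per_row floats and `dst` has room for nrows quantized rows.
static size_t quantize_tensor_example(enum ggml_type type, const float * src, void * dst,
                                      int nrows, int n_per_row, const float * imatrix) {
    if (ggml_quantize_requires_imatrix(type) && imatrix == NULL) {
        return 0; // e.g. IQ2_XXS / IQ2_XS cannot be quantized without an importance matrix
    }
    int64_t hist[16] = {0}; // per-bucket histogram filled by the legacy quantizers
    // ggml_quantize_chunk calls ggml_quantize_init internally, so no explicit init is needed here
    return ggml_quantize_chunk(type, src, dst, /*start =*/ 0, nrows, n_per_row, hist, imatrix);
}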
- // These are needed for IQ2_XS and IQ2_XXS quantizations
- GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
- GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
-
//
// gguf
//
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
switch (op->op) {
+ case GGML_OP_CPY:
+ return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
default:
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
+ for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+ const int ibx = row*blocks_per_row + i; // x block index
- const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
if (a->ne[3] != b->ne[3]) {
return false;
}
+ ggml_type a_type = a->type;
+ if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
+ if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+ return false;
+ }
+ }
return true;
} break;
case GGML_OP_GET_ROWS:
}
float sumlx = 0;
float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+ // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+ for (volatile int i = 0; i < n; ++i) {
+#else
for (int i = 0; i < n; ++i) {
+#endif
int l = nearest_int(iscale * x[i]);
l = MAX(-nmax, MIN(nmax-1, l));
L[i] = l + nmax;
float max = x[0];
float sum_w = weights ? weights[0] : x[0]*x[0];
float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+ // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+ for (volatile int i = 1; i < n; ++i) {
+#else
for (int i = 1; i < n; ++i) {
+#endif
if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i];
float w = weights ? weights[i] : x[i]*x[i];
min = 0;
}
if (max <= min) {
- for (int i = 0; i < n; ++i) L[i] = 0;
+ memset(L, 0, n);
*the_min = -min;
return 0.f;
}
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
- int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
if (!quant_weights) {
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
}
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
- int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
if (!quant_weights) {
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
}
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
- int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
if (!quant_weights) {
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
}
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
- int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
if (!quant_weights) {
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
}
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
- int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
if (!quant_weights) {
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
}
if (!quant_weights) {
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
}
- int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
if (!quant_weights) {
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
}
- int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
if (!quant_weights) {
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
}
- int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
if (!quant_weights) {
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
}
- int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+ size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
}
-static void q2xs_init_impl(int grid_size) {
+void iq2xs_init_impl(int grid_size) {
const int gindex = iq2_data_index(grid_size);
if (iq2_data[gindex].grid) {
return;
free(dist2);
}
-void ggml_init_iq2_quantization(enum ggml_type type) {
- if (type == GGML_TYPE_IQ2_XXS) {
- q2xs_init_impl(256);
- }
- else if (type == GGML_TYPE_IQ2_XS) {
- q2xs_init_impl(512);
- }
- else {
- fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
- }
-}
-
-static void q2xs_deinit_impl(int grid_size) {
+void iq2xs_free_impl(int grid_size) {
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
const int gindex = iq2_data_index(grid_size);
if (iq2_data[gindex].grid) {
}
}
-void ggml_deinit_iq2_quantization(enum ggml_type type) {
- if (type == GGML_TYPE_IQ2_XXS) {
- q2xs_deinit_impl(256);
- }
- else if (type == GGML_TYPE_IQ2_XS) {
- q2xs_deinit_impl(512);
- }
- else {
- fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
- }
-}
-
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
int num_neighbors = neighbours[0];
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
- GGML_ASSERT(quant_weights);
- GGML_ASSERT(kgrid_q2xs);
- GGML_ASSERT(kmap_q2xs);
- GGML_ASSERT(kneighbors_q2xs);
+ GGML_ASSERT(quant_weights && "missing quantization weights");
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(n%QK_K == 0);
const int kMaxQ = 3;
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
- GGML_ASSERT(quant_weights);
- GGML_ASSERT(kmap_q2xs);
- GGML_ASSERT(kgrid_q2xs);
- GGML_ASSERT(kneighbors_q2xs);
+ GGML_ASSERT(quant_weights && "missing quantization weights");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(n%QK_K == 0);
const int kMaxQ = 3;
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+void iq2xs_init_impl(int grid_size);
+void iq2xs_free_impl(int grid_size);
////////////////////////////////////////////////////////////////////////////////
+void ggml_quantize_init(enum ggml_type type) {
+ ggml_critical_section_start();
+
+ switch (type) {
+ case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
+ case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
+ default: // nothing
+ break;
+ }
+
+ ggml_critical_section_end();
+}
+
+void ggml_quantize_free(void) {
+ ggml_critical_section_start();
+
+ iq2xs_free_impl(256);
+ iq2xs_free_impl(512);
+
+ ggml_critical_section_end();
+}
+
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK4_0 == 0);
const int nb = k / QK4_0;
return (n/QK8_0*sizeof(block_q8_0));
}
+bool ggml_quantize_requires_imatrix(enum ggml_type type) {
+ return
+ type == GGML_TYPE_IQ2_XXS ||
+ type == GGML_TYPE_IQ2_XS;
+}
+
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
- (void)imatrix;
+ ggml_quantize_init(type); // this is a noop if already initialized
size_t result = 0;
int n = nrows * n_per_row;
switch (type) {
} break;
case GGML_TYPE_F16:
{
- int elemsize = sizeof(ggml_fp16_t);
+ size_t elemsize = sizeof(ggml_fp16_t);
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
result = n * elemsize;
} break;
case GGML_TYPE_F32:
{
- int elemsize = sizeof(float);
+ size_t elemsize = sizeof(float);
result = n * elemsize;
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
} break;
#include <vector>
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
+ // static RNG initialization (revisit if n_threads stops being constant)
+ static const size_t n_threads = std::thread::hardware_concurrency();
+ static std::vector<std::default_random_engine> generators = []() {
+ std::random_device rd;
+ std::vector<std::default_random_engine> vec;
+ vec.reserve(n_threads);
+ //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+ for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+ return vec;
+ }();
+
size_t size = ggml_nelements(tensor);
std::vector<float> data(size);
-#if 0
- static std::default_random_engine generator(1234);
- std::uniform_real_distribution<float> distribution(min, max);
-
- for (size_t i = 0; i < size; i++) {
- data[i] = distribution(generator);
- }
-#else
- auto init_thread = [&](size_t start, size_t end) {
- std::random_device rd;
- std::default_random_engine generator(rd());
+ auto init_thread = [&](size_t ith, size_t start, size_t end) {
std::uniform_real_distribution<float> distribution(min, max);
-
for (size_t i = start; i < end; i++) {
- data[i] = distribution(generator);
+ data[i] = distribution(generators[ith]);
}
};
- size_t n_threads = std::thread::hardware_concurrency();
std::vector<std::thread> threads;
threads.reserve(n_threads);
for (size_t i = 0; i < n_threads; i++) {
size_t start = i*size/n_threads;
size_t end = (i+1)*size/n_threads;
- threads.emplace_back(init_thread, start, end);
+ threads.emplace_back(init_thread, i, start, end);
}
for (auto & t : threads) {
t.join();
}
-#endif
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
int64_t hist[16];
- ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr);
+ std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+ const float * im = imatrix.data();
+ if (!ggml_quantize_requires_imatrix(tensor->type)) {
+ // when the imatrix is optional, we want to test both quantization with and without imatrix
+ // use one of the random numbers to decide
+ if (data[0] > 0.5f*(min + max)) {
+ im = nullptr;
+ }
+ }
+ ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
GGML_TYPE_Q8_0,
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
- GGML_TYPE_Q6_K
+ GGML_TYPE_Q6_K,
+ GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
};
// unary ops
return 1;
}
+ ggml_quantize_free();
+
printf("\033[1;32mOK\033[0m\n");
return 0;
}