#include <string>
#include <unordered_map>
#include <vector>
+#include <thread>
+#include <mutex>
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
std::vector<enum ggml_type> include_types;
};
-const int64_t SCRATCH_ELEMENTS = 32*32;
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;
stats.num_samples += nelements;
}
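+// Merge one error_stats accumulator into another: sum the sample count, total error
+// and histogram buckets, and keep the larger maximum error.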
+void combine_error_stats(error_stats & into, const error_stats & from) {
+ into.num_samples += from.num_samples;
+ into.total_error += from.total_error;
+ if (from.max_error > into.max_error) into.max_error = from.max_error;
+ for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
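+// Quantize and dequantize one contiguous chunk of a layer and accumulate the
+// round-trip error into the given stats.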
+void test_roundtrip_on_chunk(
+ const ggml_tensor * layer,
+ int64_t offset,
+ int64_t chunk_size,
+ const quantize_fns_t & qfns,
+ bool use_reference,
+ float * input_scratch,
+ char * quantized_scratch,
+ float * output_scratch,
+ error_stats & stats) {
+
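+ // F16 data is converted to F32 into the scratch buffer; F32 tensors are read in place.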
+ if (layer->type == GGML_TYPE_F16) {
+ for (int i = 0; i < chunk_size; i++) {
+ input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+ }
+ } else {
+ input_scratch = ggml_get_data_f32(layer) + offset;
+ }
+
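+ // Quantize with either the reference or the optimized kernel, then dequantize for comparison.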
+ if (use_reference) {
+ qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+ } else {
+ qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+ }
+ qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+ update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
// Run the quantization function for a single layer and update the error stats
void test_roundtrip_on_layer(
std::string & name,
const quantize_fns_t & qfns,
bool use_reference,
const ggml_tensor * layer,
- float * input_scratch,
- char *quantized_scratch,
- float * output_scratch,
- error_stats & total_error) {
+ std::vector<float> & input_scratch,
+ std::vector<char> & quantized_scratch,
+ std::vector<float> & output_scratch,
+ error_stats & total_error,
+ int max_thread = 0) {
assert(tensor_is_contiguous(layer));
error_stats layer_error {};
- int64_t nelements = ggml_nelements(layer);
-
- for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
- int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+ uint64_t nelements = ggml_nelements(layer);
- if (layer->type == GGML_TYPE_F16) {
- for (int i = 0; i < chunk_size; i++) {
- input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+ float * input_scratch_ptr = nullptr;
+ if (layer->type == GGML_TYPE_F16) {
+ if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+ input_scratch_ptr = input_scratch.data();
+ }
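+ // Size the scratch buffers for the whole layer; 4 bytes per element is ample for any quantized format.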
+ if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+ if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
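+ // max_thread < 1 means use one thread per hardware core.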
+ if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+ int chunk_size = 32*512;
+ int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
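+ // Small layers (or single-threaded runs) are processed directly on the calling thread.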
+ if (num_chunks < 2 || max_thread < 2) {
+ test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+ output_scratch.data(), print_layer_stats ? layer_error : total_error);
+ } else {
+ auto & stats = print_layer_stats ? layer_error : total_error;
+ std::mutex mutex;
+ uint64_t counter = 0;
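+ // Each worker claims the next chunk offset under the mutex, accumulates errors into a
+ // thread-local error_stats, and merges it into the shared stats once the work runs out.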
+ auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+ &quantized_scratch, &output_scratch, chunk_size] () {
+ error_stats local_stats {};
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ uint64_t offset = counter; counter += chunk_size;
+ if (offset >= nelements) {
+ combine_error_stats(stats, local_stats);
+ break;
+ }
+ lock.unlock();
+ uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+ test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+ quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
}
- } else {
- input_scratch = ggml_get_data_f32(layer) + offset;
- }
-
- if (use_reference) {
- qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
- } else {
- qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
- }
- qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
- update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
- if (print_layer_stats) {
- update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
- }
+ };
+ int nthread = std::min(num_chunks, max_thread);
+ std::vector<std::thread> workers(nthread-1);
+ for (auto& w : workers) w = std::thread(compute);
+ compute();
+ for (auto& w : workers) w.join();
}
+
if (print_layer_stats) {
print_error_stats(name, layer_error, false);
+ combine_error_stats(total_error, layer_error);
}
}
// read command line
+ int max_thread = 0;
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
invalid_param = true;
}
+ } else if (arg == "-n" || arg == "--num-threads") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ max_thread = atoi(argv[i]);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
}
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
// allocate scratch space
- std::vector<float> input_scratch(SCRATCH_ELEMENTS);
- std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
- std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+ std::vector<float> input_scratch;
+ std::vector<char> quantized_scratch;
+ std::vector<float> output_scratch;
// loop through quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
qfns,
params.reference,
kv_tensor.second,
- input_scratch.data(),
- quantized_scratch.data(),
- output_scratch.data(),
- global_stats
+ input_scratch,
+ quantized_scratch,
+ output_scratch,
+ global_stats,
+ max_thread
);
}
#include <memory>
#include <algorithm>
#include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
// quantization
//
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
ggml_type quantized_type;
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
default: throw format("invalid output file type %d\n", ftype);
};
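+ // nthread <= 0 requests one thread per hardware core.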
+ if (nthread <= 0) {
+ nthread = std::thread::hardware_concurrency();
+ }
+
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
/*vocab_only*/ false));
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
size_t total_size_new = 0;
std::vector<int64_t> hist_all(1 << 4, 0);
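+ // Shared across tensors: worker thread handles and a mutex guarding the chunk counter and result accumulation.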
+ std::vector<std::thread> workers;
+ std::mutex mutex;
+
size_t idx = 0;
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
llama_buffer read_data;
new_data = work.addr;
std::vector<int64_t> hist_cur(1 << 4, 0);
- switch (new_type) {
- case GGML_TYPE_Q4_0:
- {
- new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- case GGML_TYPE_Q4_2:
- {
- new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- case GGML_TYPE_Q4_3:
- {
- new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- default:
- LLAMA_ASSERT(false);
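+ // Quantize the tensor in fixed-size chunks, in parallel when there is enough work for multiple threads.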
+ int chunk_size = 32 * 512;
+ const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+ const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+ if (nthread_use < 2) {
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+ } else {
+ size_t counter = 0;
+ new_size = 0;
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+ std::vector<int64_t> local_hist;
+ size_t local_size = 0;
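+ // Workers grab chunk ranges under the mutex and fold their local histogram and size into the shared totals before exiting.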
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ size_t first = counter; counter += chunk_size;
+ if (first >= nelements) {
+ if (!local_hist.empty()) {
+ for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ new_size += local_size;
+ }
+ break;
+ }
+ lock.unlock();
+ size_t last = std::min(nelements, first + chunk_size);
+ if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+ }
+ };
+ if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ compute();
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
}
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
- enum llama_ftype ftype) {
+ enum llama_ftype ftype,
+ int nthread) {
try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype);
+ llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
return 0;
} catch (const std::string & err) {
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());