// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
- ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+ const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
- qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+ qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
}
static void test_roundtrip_on_chunk(
- const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+ const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
if (layer->type == GGML_TYPE_F16) {
// Run quantization function for a single layer and update error stats
static void test_roundtrip_on_layer(
- std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+ std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
- if (qfns.from_float && qfns.to_float) {
+ const auto * qfns = ggml_get_type_traits(type);
+ if (qfns->from_float && qfns->to_float) {
if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type));
}
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
- qfns,
+ *qfns,
params.reference,
kv_tensor.second,
input_scratch,
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
- typedef struct {
+ struct ggml_type_traits {
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
- } ggml_type_traits_t;
+ };
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+ GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
}
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+ return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
// convert src0 to float
if (type != GGML_TYPE_F32) {
- ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
- ggml_to_float_t const to_float = type_traits.to_float;
+ const auto * type_traits = ggml_get_type_traits(type);
+ ggml_to_float_t const to_float = type_traits->to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// TODO: find the optimal value
const int64_t min_batch = 32;
- return (ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
- src1->type == GGML_TYPE_F32 &&
- (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
+ return ggml_is_contiguous(src0) &&
+ ggml_is_contiguous(src1) &&
+ src1->type == GGML_TYPE_F32 &&
+ (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
+ (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
}
case GGML_OP_OUT_PROD:
- return (op->src[0]->type == GGML_TYPE_F32 &&
- op->src[1]->type == GGML_TYPE_F32 &&
- ggml_is_matrix(src0) &&
- ggml_is_matrix(src1) &&
- ggml_is_contiguous(src0) &&
- (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
+ return op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_F32 &&
+ ggml_is_matrix(src0) &&
+ ggml_is_matrix(src1) &&
+ ggml_is_contiguous(src0) &&
+ (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
+ (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
default:
return false;
return;
}
- ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
+ const auto * tt = ggml_get_type_traits(quant);
- ggml_to_float_t dequant_fn = tt.to_float;
+ ggml_to_float_t dequant_fn = tt->to_float;
dequant_fn(from, to, ne);
}
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
};
// For internal test use
// NOTE(review): the new name drops "internal" - confirm whether the remark above still applies.
//
// Looks up the type_traits table entry for `type`.
// The removed variant returned the entry by value; the added variant returns a
// pointer-to-const to the table entry itself, so callers use `->` and get no copy.
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
// bounds check before indexing the table
GGML_ASSERT(type < GGML_TYPE_COUNT);
- return type_traits[type];
+ return &type_traits[type];
}
//
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
- auto funcs = ggml_internal_get_type_traits(ggml_type);
+ const auto * funcs = ggml_get_type_traits(ggml_type);
Stat simple, ggml;
t1 = std::chrono::high_resolution_clock::now();
float fs;
- if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
- else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
+ if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
+ else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t);
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
- auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
+ const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
std::vector<block_q4_0> q40;
std::vector<block_q4_1> q41;
// Note, we do not include this in the timing as in practical application
// we already have the quantized model weights.
if (useQ4_1) {
- funcs.from_float(x1.data(), q41.data(), kVecSize);
+ funcs->from_float(x1.data(), q41.data(), kVecSize);
} else {
- funcs.from_float(x1.data(), q40.data(), kVecSize);
+ funcs->from_float(x1.data(), q40.data(), kVecSize);
}
// Now measure time the dot product needs using the "scalar" version above
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
}
else {
- auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
- vdot.from_float(y1.data(), q8.data(), kVecSize);
- if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
- else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+ const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
+ vdot->from_float(y1.data(), q8.data(), kVecSize);
+ if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+ else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
}
sumq += result;
t2 = std::chrono::high_resolution_clock::now();
}
float * f32_output = (float *) output.data();
- ggml_type_traits_t qtype;
+ const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
if (ggml_is_quantized(tensor->type)) {
- qtype = ggml_internal_get_type_traits(tensor->type);
- if (qtype.to_float == NULL) {
+ if (qtype->to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
}
} else if (tensor->type != GGML_TYPE_F16 &&
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
- qtype.to_float(tensor->data, f32_output, nelements);
+ qtype->to_float(tensor->data, f32_output, nelements);
} else {
GGML_ABORT("fatal error"); // unreachable
}
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
- qtype.to_float(inbuf, outbuf, nels);
+ qtype->to_float(inbuf, outbuf, nels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
std::vector<uint8_t> buf(ggml_nbytes(t));
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
- ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+ const auto * tt = ggml_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
std::vector<float> vq(ggml_blck_size(t->type));
bool quantized = ggml_is_quantized(t->type);
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
- tt.to_float(&buf[i], vq.data(), bs);
+ tt->to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");
}
// Total quantization error on test data
// Quantizes `test_size` floats from `test_data` with qfns->from_float, converts them
// back with qfns->to_float, and returns array_rmse between the input and the round trip.
// The traits are now taken as a pointer-to-const instead of a mutable reference.
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
// scratch buffer for the quantized form: 2*test_size bytes
std::vector<uint8_t> tmp_q(2*test_size);
// dequantized result, one float per input element
std::vector<float> tmp_out(test_size);
- qfns.from_float(test_data, tmp_q.data(), test_size);
- qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+ qfns->from_float(test_data, tmp_q.data(), test_size);
+ qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
return array_rmse(test_data, tmp_out.data(), test_size);
}
// Error of from_float relative to from_float_ref on test data
// Round-trips `test_data` twice: once through from_float -> to_float and once through
// from_float_ref -> to_float. Returns array_rmse between the two dequantized outputs;
// the original `test_data` is not part of the final comparison.
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
// scratch buffer for the quantized form: 2*test_size bytes, shared by both passes
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);
- qfns.from_float(test_data, tmp_q.data(), test_size);
- qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+ qfns->from_float(test_data, tmp_q.data(), test_size);
+ qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
// second pass overwrites tmp_q with the from_float_ref result
- qfns.from_float_ref(test_data, tmp_q.data(), test_size);
- qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+ qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+ qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
// Total dot product error
static float dot_product_error(
- ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+ const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);
- auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+ const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
- qfns.from_float(test_data1, tmp_q1.data(), test_size);
- vdot.from_float(test_data2, tmp_q2.data(), test_size);
+ qfns->from_float(test_data1, tmp_q1.data(), test_size);
+ vdot->from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
- qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+ qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
const float dot_ref = dot_product(test_data1, test_data2, test_size);
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+ const auto * qfns = ggml_get_type_traits(type);
// deprecated - skip
- if (qfns.blck_size == 0) {
+ if (qfns->blck_size == 0) {
continue;
}
printf("Testing %s\n", ggml_type_name((ggml_type) i));
ggml_quantize_init(ei);
- if (qfns.from_float && qfns.to_float) {
+ if (qfns->from_float && qfns->to_float) {
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+ const auto * qfns = ggml_get_type_traits(type);
if (ggml_type_name(type) != NULL) {
- if (qfns.from_float && qfns.to_float) {
+ if (qfns->from_float && qfns->to_float) {
printf(" %s", ggml_type_name(type));
}
}
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+ const auto * qfns = ggml_get_type_traits(type);
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
continue;
}
- if (qfns.from_float && qfns.to_float) {
+ if (qfns->from_float && qfns->to_float) {
printf("%s\n", ggml_type_name(type));
ggml_quantize_init(type);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
- qfns.from_float_ref(test_data1, test_q1, size);
+ qfns->from_float_ref(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
- qfns.from_float(test_data1, test_q1, size);
+ qfns->from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
if (params.op_dequantize_row_q) {
printf(" dequantize_row_q\n");
- qfns.from_float(test_data1, test_q1, largest);
+ qfns->from_float(test_data1, test_q1, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
- qfns.to_float(test_q1, test_out, size);
+ qfns->to_float(test_q1, test_out, size);
return test_out[0];
};
size_t quantized_size = ggml_row_size(type, size);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
- auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
- vdot.from_float(test_data1, test_q1, size);
+ const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+ vdot->from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
if (params.op_vec_dot_q) {
printf(" vec_dot_q\n");
- qfns.from_float(test_data1, test_q1, largest);
- qfns.from_float(test_data2, test_q2, largest);
+ qfns->from_float(test_data1, test_q1, largest);
+ qfns->from_float(test_data2, test_q2, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
float result;
- qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+ qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
return result;
};
size_t quantized_size = ggml_row_size(type, size);