From: Diego Devesa Date: Tue, 8 Oct 2024 12:21:43 +0000 (+0200) Subject: ggml : fix BLAS with unsupported types (llama/9775) X-Git-Tag: upstream/0.0.1642~292 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=d6a7a6856bc79c3b479f4c940524f9946e72e4a0;p=pkg%2Fggml%2Fsources%2Fggml ggml : fix BLAS with unsupported types (llama/9775) * ggml : do not use BLAS with types without to_float * ggml : return pointer from ggml_internal_get_type_traits to avoid unnecessary copies * ggml : rename ggml_internal_get_type_traits -> ggml_get_type_traits it's not really internal if everybody uses it --- diff --git a/include/ggml.h b/include/ggml.h index e7678d07..4508da4f 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -2535,7 +2535,7 @@ extern "C" { typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y, int nr, int nc); - typedef struct { + struct ggml_type_traits { const char * type_name; int64_t blck_size; int64_t blck_size_interleave; // interleave elements in blocks @@ -2551,9 +2551,9 @@ extern "C" { int64_t ncols; // number of columns to process simultaneously ggml_gemv_t gemv; ggml_gemm_t gemm; - } ggml_type_traits_t; + }; - GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp index fbd49d13..627b4dbc 100644 --- a/src/ggml-backend.cpp +++ b/src/ggml-backend.cpp @@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st op->type != GGML_TYPE_IQ1_S && op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; case GGML_OP_ROPE_BACK: return op->src[2] == NULL && (op->op_params[2] & 4) == 0; case GGML_OP_IM2COL_BACK: diff --git a/src/ggml-blas.cpp b/src/ggml-blas.cpp index 0c6574de..55f72458 100644 --- a/src/ggml-blas.cpp +++ b/src/ggml-blas.cpp @@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - ggml_to_float_t const to_float = type_traits.to_float; + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t const to_float = type_traits->to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s // TODO: find the optimal value const int64_t min_batch = 32; - return (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type == GGML_TYPE_F32 && - (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch)); + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } case GGML_OP_OUT_PROD: - return (op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); default: return false; diff --git a/src/ggml-vulkan.cpp b/src/ggml-vulkan.cpp index 30bd376d..374c6ecd 100644 --- a/src/ggml-vulkan.cpp +++ b/src/ggml-vulkan.cpp @@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg return; } - ggml_type_traits_t tt = ggml_internal_get_type_traits(quant); + const auto * tt = ggml_get_type_traits(quant); - ggml_to_float_t dequant_fn = tt.to_float; + ggml_to_float_t dequant_fn = tt->to_float; dequant_fn(from, to, ne); } diff --git a/src/ggml.c b/src/ggml.c index 03b832d0..3f01092d 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); -static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { +static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", .blck_size = 1, @@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }; // For internal test use -ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { +const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { GGML_ASSERT(type < GGML_TYPE_COUNT); - return type_traits[type]; + return &type_traits[type]; } // diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index fa26cc65..ee1a8877 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -133,7 +133,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { std::vector buf(ggml_nbytes(t)); ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); - ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); + const auto * tt = ggml_get_type_traits(t->type); size_t bs = ggml_blck_size(t->type); std::vector vq(ggml_blck_size(t->type)); bool quantized = ggml_is_quantized(t->type); @@ -159,7 +159,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - tt.to_float(&buf[i], vq.data(), bs); + tt->to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ABORT("fatal error"); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index ccf5721a..d50417ba 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) { } // Total quantization error on test data -static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { +static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); - qfns.from_float(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->from_float(test_data, tmp_q.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); return array_rmse(test_data, tmp_out.data(), test_size); } // Total quantization error on test data -static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { +static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); std::vector tmp_out_ref(test_size); - qfns.from_float(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->from_float(test_data, tmp_q.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); - qfns.from_float_ref(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size); + qfns->from_float_ref(test_data, tmp_q.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); } @@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) { // Total dot product error static float dot_product_error( - ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2 + const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2 ) { std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); - auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type); - qfns.from_float(test_data1, tmp_q1.data(), test_size); - vdot.from_float(test_data2, tmp_q2.data(), test_size); + qfns->from_float(test_data1, tmp_q1.data(), test_size); + vdot->from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); const float dot_ref = dot_product(test_data1, test_data2, test_size); @@ -131,10 +131,10 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + const auto * qfns = ggml_get_type_traits(type); // deprecated - skip - if (qfns.blck_size == 0) { + if (qfns->blck_size == 0) { continue; } @@ -143,7 +143,7 @@ int main(int argc, char * argv[]) { printf("Testing %s\n", ggml_type_name((ggml_type) i)); ggml_quantize_init(ei); - if (qfns.from_float && qfns.to_float) { + if (qfns->from_float && qfns->to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 24e06605..bdbdd90a 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -122,9 +122,9 @@ static void usage(char * argv[]) { printf(" --type TYPE set test type as"); for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + const auto * qfns = ggml_get_type_traits(type); if (ggml_type_name(type) != NULL) { - if (qfns.from_float && qfns.to_float) { + if (qfns->from_float && qfns->to_float) { printf(" %s", ggml_type_name(type)); } } @@ -270,12 +270,12 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + const auto * qfns = ggml_get_type_traits(type); if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { continue; } - if (qfns.from_float && qfns.to_float) { + if (qfns->from_float && qfns->to_float) { printf("%s\n", ggml_type_name(type)); ggml_quantize_init(type); @@ -285,7 +285,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns.from_float_ref(test_data1, test_q1, size); + qfns->from_float_ref(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -299,7 +299,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns.from_float(test_data1, test_q1, size); + qfns->from_float(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -310,11 +310,11 @@ int main(int argc, char * argv[]) { if (params.op_dequantize_row_q) { printf(" dequantize_row_q\n"); - qfns.from_float(test_data1, test_q1, largest); + qfns->from_float(test_data1, test_q1, largest); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns.to_float(test_q1, test_out, size); + qfns->to_float(test_q1, test_out, size); return test_out[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -328,8 +328,8 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); - vdot.from_float(test_data1, test_q1, size); + const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type); + vdot->from_float(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -340,13 +340,13 @@ int main(int argc, char * argv[]) { if (params.op_vec_dot_q) { printf(" vec_dot_q\n"); - qfns.from_float(test_data1, test_q1, largest); - qfns.from_float(test_data2, test_q2, largest); + qfns->from_float(test_data1, test_q1, largest); + qfns->from_float(test_data2, test_q2, largest); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); + qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); return result; }; size_t quantized_size = ggml_row_size(type, size);