From: Piotr Wilkin (ilintar) Date: Thu, 13 Nov 2025 18:54:47 +0000 (+0100) Subject: ggml : add ops SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM (llama/17063) X-Git-Tag: upstream/0.9.4.395~180 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f45db4747f1d15b75a96443ee97a0aa5c6b21196;p=pkg%2Fggml%2Fsources%2Fggml ggml : add ops SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM (llama/17063) * Add ops needed for new hybrid models: SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM * Update ggml/include/ggml.h Co-authored-by: Georgi Gerganov * Update tests/test-backend-ops.cpp Co-authored-by: Georgi Gerganov * Code review * Whitespace * Update tests/test-backend-ops.cpp Co-authored-by: Diego Devesa * This is actually sigmoid, duh. * Add CONST, remove TRI_KEEP, other changes from review * Update tests/test-backend-ops.cpp Co-authored-by: Georgi Gerganov * Update ggml/src/ggml.c Co-authored-by: Georgi Gerganov * Update ggml/src/ggml.c Co-authored-by: Georgi Gerganov * Update ggml/src/ggml-cuda/unary.cu Co-authored-by: Aman Gupta * Remove extra script * Update ggml/src/ggml.c Co-authored-by: Diego Devesa * Update tests/test-backend-ops.cpp Co-authored-by: Diego Devesa * moving changes from laptop [no ci] * pre-rebase * Update tests/test-backend-ops.cpp Co-authored-by: Sigbjørn Skjæret * Update tests/test-backend-ops.cpp Co-authored-by: Sigbjørn Skjæret * Refactor tests * ggml : cleanup * cont : fix ggml_fill srcs * tests : add note * ggml : add ggml_fill_inplace * ggml : add asserts * ggml : fix ggml_fill constant cast * cont : ggml_tri minor * Use TENSOR_LOCALS * Fix regression from #14596, regenerate * Don't make commits at night... --------- Co-authored-by: Georgi Gerganov Co-authored-by: Diego Devesa Co-authored-by: Aman Gupta Co-authored-by: Sigbjørn Skjæret --- diff --git a/include/ggml.h b/include/ggml.h index c1ed1a21..605fcfcb 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -475,6 +475,7 @@ extern "C" { GGML_OP_COS, GGML_OP_SUM, GGML_OP_SUM_ROWS, + GGML_OP_CUMSUM, GGML_OP_MEAN, GGML_OP_ARGMAX, GGML_OP_COUNT_EQUAL, @@ -530,6 +531,8 @@ extern "C" { GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, GGML_OP_LEAKY_RELU, + GGML_OP_TRI, + GGML_OP_FILL, GGML_OP_FLASH_ATTN_EXT, GGML_OP_FLASH_ATTN_BACK, @@ -542,6 +545,7 @@ extern "C" { GGML_OP_RWKV_WKV6, GGML_OP_GATED_LINEAR_ATTN, GGML_OP_RWKV_WKV7, + GGML_OP_SOLVE_TRI, GGML_OP_UNARY, @@ -576,6 +580,8 @@ extern "C" { GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_EXP, + GGML_UNARY_OP_EXPM1, + GGML_UNARY_OP_SOFTPLUS, GGML_UNARY_OP_GELU_ERF, GGML_UNARY_OP_XIELU, GGML_UNARY_OP_FLOOR, @@ -620,6 +626,13 @@ extern "C" { GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) }; + enum ggml_tri_type { + GGML_TRI_TYPE_UPPER_DIAG = 0, + GGML_TRI_TYPE_UPPER = 1, + GGML_TRI_TYPE_LOWER_DIAG = 2, + GGML_TRI_TYPE_LOWER = 3 + }; + struct ggml_init_params { // memory pool size_t mem_size; // bytes @@ -957,6 +970,22 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_expm1( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_expm1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_softplus( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_softplus_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sin( struct ggml_context * ctx, struct ggml_tensor * a); @@ -983,6 +1012,10 @@ 
extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_cumsum( + struct ggml_context * ctx, + struct ggml_tensor * a); + // mean along rows GGML_API struct ggml_tensor * ggml_mean( struct ggml_context * ctx, @@ -2187,6 +2220,23 @@ extern "C" { int shift2, int shift3); + // Convert matrix into a triangular one (upper, strict upper, lower or strict lower) by writing + // zeroes everywhere outside the masked area + GGML_API struct ggml_tensor * ggml_tri( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_tri_type type); + + // Fill tensor a with constant c + GGML_API struct ggml_tensor * ggml_fill( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c); + + GGML_API struct ggml_tensor * ggml_fill_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c); // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] @@ -2356,6 +2406,27 @@ extern "C" { struct ggml_tensor * b, struct ggml_tensor * state); + /* Solves a specific equation of the form Ax=B, where A is a triangular matrix + * without zeroes on the diagonal (i.e. invertible). + * B can have any number of columns, but must have the same number of rows as A + * If A is [n, n] and B is [n, m], then the result will be [n, m] as well + * Has O(n^3) complexity (unlike most matrix ops out there), so use on cases + * where n > 100 sparingly, pre-chunk if necessary. + * + * If left = false, solves xA=B instead + * If lower = false, assumes upper triangular instead + * If uni = true, assumes diagonal of A to be all ones (will override actual values) + * + * TODO: currently only lower, right, non-unitriangular variant is implemented + */ + GGML_API struct ggml_tensor * ggml_solve_tri( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool left, + bool lower, + bool uni); + // custom operators typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c index d8e3c48c..c7348cc2 100644 --- a/src/ggml-cpu/ggml-cpu.c +++ b/src/ggml-cpu/ggml-cpu.c @@ -1731,6 +1731,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_sum_rows(params, tensor); } break; + case GGML_OP_CUMSUM: + { + ggml_compute_forward_cumsum(params, tensor); + } break; case GGML_OP_MEAN: { ggml_compute_forward_mean(params, tensor); @@ -1927,6 +1931,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_leaky_relu(params, tensor); } break; + case GGML_OP_TRI: + { + ggml_compute_forward_tri(params, tensor); + } break; + case GGML_OP_FILL: + { + ggml_compute_forward_fill(params, tensor); + } break; case GGML_OP_FLASH_ATTN_EXT: { ggml_compute_forward_flash_attn_ext(params, tensor); @@ -1982,6 +1994,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rwkv_wkv7(params, tensor); } break; + case GGML_OP_SOLVE_TRI: + { + ggml_compute_forward_solve_tri(params, tensor); + } break; case GGML_OP_MAP_CUSTOM1: { ggml_compute_forward_map_custom1(params, tensor); @@ -2140,6 +2156,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_ADD_ID: case GGML_OP_ADD1: case GGML_OP_ACC: + case GGML_OP_CUMSUM: + case GGML_OP_TRI: + case GGML_OP_FILL: { n_tasks = n_threads; } break; @@ -2157,6 +2176,7 
@@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = 1; } break; case GGML_OP_COUNT_EQUAL: + case GGML_OP_SOLVE_TRI: { n_tasks = n_threads; } break; @@ -2179,6 +2199,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_SOFTPLUS: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_CEIL: case GGML_UNARY_OP_ROUND: diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp index 09f53b47..b6209588 100644 --- a/src/ggml-cpu/ops.cpp +++ b/src/ggml-cpu/ops.cpp @@ -9,6 +9,7 @@ #include #include +#include #include // ggml_compute_forward_dup @@ -1395,6 +1396,56 @@ void ggml_compute_forward_sum( } } +// ggml_compute_forward_cumsum + +static void ggml_compute_forward_cumsum_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + const auto [ir0, ir1] = get_thread_range(params, src0); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + float * src_row = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + float * dst_row = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_vec_cumsum_f32(ne00, dst_row, src_row); + } +} + +void ggml_compute_forward_cumsum( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cumsum_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_sum_rows static void ggml_compute_forward_sum_rows_f32( @@ -2141,6 +2192,83 @@ static void ggml_compute_forward_gelu( } } +// ggml_compute_fill + +static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, ggml_tensor * dst) { + const float c = ggml_get_op_params_f32(dst, 0); + + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const auto [ir0, ir1] = get_thread_range(params, dst); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne2*ne1); + const int64_t i02 = (ir - i03*ne2*ne1)/ne1; + const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1); + + ggml_vec_set_f32(ne0, dst_ptr, c); + } +} + +void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) { + ggml_compute_forward_fill_f32(params, dst); +} + +// ggml_compute_tri + +static void ggml_compute_forward_tri_f32(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + GGML_TENSOR_UNARY_OP_LOCALS + + const auto [ir0, ir1] = get_thread_range(params, src0); + + bool (*bipred)(int, int); + + switch (ttype) { + case GGML_TRI_TYPE_LOWER: bipred = [](int i, int r) { return i < r; }; break; + case GGML_TRI_TYPE_LOWER_DIAG: bipred = [](int i, int r) { return i <= r; }; break; + case GGML_TRI_TYPE_UPPER: bipred = [](int i, int r) { 
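The cumsum forward pass above applies ggml_vec_cumsum_f32 independently to every ne00-long row, so each output element is the running total of its row up to that index. A plain scalar mirror of that scan (ordinary C++, not the ggml code itself):

    #include <cstdio>

    // running total of one row: y = {x0, x0+x1, x0+x1+x2, ...}
    static void cumsum_row(int n, float * y, const float * x) {
        float acc = 0.0f;
        for (int i = 0; i < n; ++i) {
            acc += x[i];
            y[i] = acc;
        }
    }

    int main() {
        const float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float y[4];
        cumsum_row(4, y, x);
        printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);   // 1 3 6 10
        return 0;
    }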
return i > r; }; break; + case GGML_TRI_TYPE_UPPER_DIAG: bipred = [](int i, int r) { return i >= r; }; break; + default: GGML_ABORT("invalid tri type"); + } + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const float * src_ptr = (const float *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * dst_ptr = ( float *) (( char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1); + + for (int i0 = 0; i0 < ne0; ++i0) { + dst_ptr[i0] = bipred(i0, i01) ? src_ptr[i0] : 0.0f; + } + } +} + +void ggml_compute_forward_tri(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tri_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_gelu_erf static void ggml_compute_forward_gelu_erf_f32( @@ -8536,7 +8664,7 @@ static void ggml_compute_forward_ssm_scan_f32( // n_head for (int h = ih0; h < ih1; ++h) { // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16 - const float dt_soft_plus = ggml_softplus(dt[h]); + const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]); const float dA = expf(dt_soft_plus * A[h]); const int g = h / (nh / ng); // repeat_interleave @@ -8633,7 +8761,7 @@ static void ggml_compute_forward_ssm_scan_f32( // n_head for (int h = ih0; h < ih1; ++h) { // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16 - const float dt_soft_plus = ggml_softplus(dt[h]); + const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]); const int g = h / (nh / ng); // repeat_interleave // dim @@ -8916,6 +9044,14 @@ void ggml_compute_forward_unary( { ggml_compute_forward_xielu(params, dst); } break; + case GGML_UNARY_OP_EXPM1: + { + ggml_compute_forward_expm1(params, dst); + } break; + case GGML_UNARY_OP_SOFTPLUS: + { + ggml_compute_forward_softplus(params, dst); + } break; default: { GGML_ABORT("fatal error"); @@ -9512,6 +9648,76 @@ void ggml_compute_forward_gla( } } +static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; // A (lower triangular) + const struct ggml_tensor * src1 = dst->src[1]; // B (RHS) + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ne00 == ne01); // A must be square + GGML_ASSERT(ne0 == ne10); // solution cols == B cols + GGML_ASSERT(ne1 == ne11); // solution rows == B rows + + GGML_ASSERT(ne02 == ne12 && ne12 == ne2); + GGML_ASSERT(ne03 == ne13 && ne13 == ne3); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t k = ne10; // number of RHS columns + const int64_t n = ne11; // A is n×n + const int64_t nr = ne02 * ne03 * k; // we're parallelizing on columns here, so seq x token x column will be the unit + + // chunks per thread + const int64_t dr = (nr + nth - 1)/nth; + + // chunk range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + const float * A = (const float *) src0->data; // [n, n, B1, B2] + const float * B = (const float *) src1->data; // [n, k, B1, B2] + float * X = ( float *) dst->data; // [n, 
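In the tri loop above, i01 is the row index and i0 the position inside the row, so the four ggml_tri_type variants reduce to a column/row comparison: the *_DIAG variants keep the diagonal, the bare variants are strict. A standalone mirror of that predicate (enum and function names here are illustrative, not ggml API):

    #include <cstdio>

    enum tri_type { TRI_UPPER_DIAG, TRI_UPPER, TRI_LOWER_DIAG, TRI_LOWER };

    static bool keep(tri_type t, int col, int row) {
        switch (t) {
            case TRI_LOWER:      return col <  row;  // strictly below the diagonal
            case TRI_LOWER_DIAG: return col <= row;  // below the diagonal, diagonal kept
            case TRI_UPPER:      return col >  row;  // strictly above the diagonal
            case TRI_UPPER_DIAG: return col >= row;  // above the diagonal, diagonal kept
        }
        return false;
    }

    int main() {
        // 4x4 keep-mask for the strict lower variant:
        // row 0: . . . .
        // row 1: x . . .
        // row 2: x x . .
        // row 3: x x x .
        for (int row = 0; row < 4; ++row) {
            for (int col = 0; col < 4; ++col) {
                printf("%c ", keep(TRI_LOWER, col, row) ? 'x' : '.');
            }
            printf("\n");
        }
        return 0;
    }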
k, B1, B2] + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*k); + const int64_t i02 = (ir - i03*ne02*k)/k; + const int64_t i01 = (ir - i03*ne02*k - i02*k); + + const float * A_batch = A + i02 * nb02 / sizeof(float) + i03 * nb03 / sizeof(float); + const float * B_batch = B + i02 * nb12 / sizeof(float) + i03 * nb13 / sizeof(float); + + float * X_batch = X + i02 * nb2 / sizeof(float) + i03 * nb3 / sizeof(float); + + for (int64_t i00 = 0; i00 < n; ++i00) { + float sum = 0.0f; + for (int64_t t = 0; t < i00; ++t) { + sum += A_batch[i00 * n + t] * X_batch[i01 * n + t]; + } + + const float diag = A_batch[i00 * n + i00]; + GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix"); + + X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag; + } + } +} + +void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_compute_forward_solve_tri_f32(params, dst); + } else { + GGML_ABORT("fatal error"); + } +} + // ggml_compute_forward_rwkv_wkv7 static void ggml_compute_forward_rwkv_wkv7_f32( diff --git a/src/ggml-cpu/ops.h b/src/ggml-cpu/ops.h index 2b4127c1..98a0eec1 100644 --- a/src/ggml-cpu/ops.h +++ b/src/ggml-cpu/ops.h @@ -34,6 +34,7 @@ void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst); @@ -81,6 +82,8 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_flash_attn_back( const struct ggml_compute_params * params, @@ -96,6 +99,7 @@ void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst); void 
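The solve_tri kernel above is plain forward substitution applied independently to every right-hand-side column: x[i] = (b[i] - sum over t < i of A[i][t] * x[t]) / A[i][i], which is why a zero on the diagonal is rejected. A minimal single-column version with a worked 2x2 example (ordinary C++, not the ggml code):

    #include <cassert>
    #include <cstdio>

    // A is a row-major n x n lower-triangular matrix; x[i] only depends on x[0..i-1]
    static void solve_lower_col(int n, const float * A, const float * b, float * x) {
        for (int i = 0; i < n; ++i) {
            float sum = 0.0f;
            for (int t = 0; t < i; ++t) {
                sum += A[i*n + t] * x[t];   // contribution of already-solved unknowns
            }
            assert(A[i*n + i] != 0.0f);     // the op requires a non-zero diagonal
            x[i] = (b[i] - sum) / A[i*n + i];
        }
    }

    int main() {
        // A = [[2, 0], [1, 4]], b = [2, 9]  ->  x = [1, 2]
        const float A[4] = { 2, 0, 1, 4 };
        const float b[2] = { 2, 9 };
        float x[2];
        solve_lower_col(2, A, b, x);
        printf("%g %g\n", x[0], x[1]);
        return 0;
    }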
ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/src/ggml-cpu/unary-ops.cpp b/src/ggml-cpu/unary-ops.cpp index a047537b..1d9873ad 100644 --- a/src/ggml-cpu/unary-ops.cpp +++ b/src/ggml-cpu/unary-ops.cpp @@ -73,6 +73,14 @@ static inline float op_log(float x) { return logf(x); } +static inline float op_expm1(float x) { + return expf(x) - 1.0f; +} + +static inline float op_softplus(float x) { + return (x > 20.0f) ? x : logf(1.0f + expf(x)); +} + static inline float op_floor(float x) { return floorf(x); } @@ -290,6 +298,14 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * unary_op(params, dst); } +void ggml_compute_forward_expm1(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_softplus(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) { unary_op(params, dst); } diff --git a/src/ggml-cpu/unary-ops.h b/src/ggml-cpu/unary-ops.h index fa45d9f0..bcad5a3a 100644 --- a/src/ggml-cpu/unary-ops.h +++ b/src/ggml-cpu/unary-ops.h @@ -22,6 +22,8 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_expm1(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_softplus(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/src/ggml-cpu/vec.h b/src/ggml-cpu/vec.h index 65c7dfb6..ac59f1fe 100644 --- a/src/ggml-cpu/vec.h +++ b/src/ggml-cpu/vec.h @@ -1416,6 +1416,16 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #endif } +inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + if (i == 0) { + y[i] = x[i]; + } else { + y[i] = y[i - 1] + x[i]; + } + } +} + inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) { diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu index 41de87c0..7d792e60 100644 --- a/src/ggml-cuda/ggml-cuda.cu +++ b/src/ggml-cuda/ggml-cuda.cu @@ -2527,6 +2527,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_UNARY_OP_TRUNC: ggml_cuda_op_trunc(ctx, dst); break; + case GGML_UNARY_OP_EXPM1: + ggml_cuda_op_expm1(ctx, dst); + break; + case GGML_UNARY_OP_SOFTPLUS: + ggml_cuda_op_softplus(ctx, dst); + break; default: return false; } @@ -3829,6 +3835,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case 
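op_softplus above shortcuts large inputs because expf() overflows to +inf near x of about 88, and for x > 20 softplus(x) already equals x to within float precision. A tiny scalar reference (plain C++, not ggml code) showing the behaviour at a few points:

    #include <cmath>
    #include <cstdio>

    static float softplus_ref(float x) {
        return (x > 20.0f) ? x : logf(1.0f + expf(x));
    }

    int main() {
        printf("softplus(-5) = %g\n", softplus_ref(-5.0f));   // ~0.0067, smooth ReLU tail
        printf("softplus(0)  = %g\n", softplus_ref(0.0f));    // log(2) ~= 0.6931
        printf("softplus(30) = %g\n", softplus_ref(30.0f));   // 30, returned directly
        return 0;
    }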
GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: + case GGML_UNARY_OP_SOFTPLUS: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_CEIL: diff --git a/src/ggml-cuda/unary.cu b/src/ggml-cuda/unary.cu index c1dc6ddb..d4866067 100644 --- a/src/ggml-cuda/unary.cu +++ b/src/ggml-cuda/unary.cu @@ -81,6 +81,14 @@ static __device__ __forceinline__ float op_log(float x) { return logf(x); } +static __device__ __forceinline__ float op_expm1(float x) { + return expm1f(x); +} + +static __device__ __forceinline__ float op_softplus(float x) { + return (x > 20.0f) ? x : logf(1.0f + expf(x)); +} + static __device__ __forceinline__ float op_elu(float x) { return (x > 0.f) ? x : expm1f(x); } @@ -233,6 +241,14 @@ void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary(ctx, dst); } + +void ggml_cuda_op_expm1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary(ctx, dst); +} + +void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary(ctx, dst); +} /* gated ops */ template diff --git a/src/ggml-cuda/unary.cuh b/src/ggml-cuda/unary.cuh index 2800c75b..609046e5 100644 --- a/src/ggml-cuda/unary.cuh +++ b/src/ggml-cuda/unary.cuh @@ -61,6 +61,10 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_expm1(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/src/ggml-impl.h b/src/ggml-impl.h index ec37a253..fe57d4c5 100644 --- a/src/ggml-impl.h +++ b/src/ggml-impl.h @@ -102,7 +102,7 @@ static bool ggml_op_is_empty(enum ggml_op op) { } } -static inline float ggml_softplus(float input) { +static inline float ggml_compute_softplus_f32(float input) { return (input > 20.0f) ? 
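Worth noting: the CUDA op_expm1 above uses expm1f(), while the CPU op_expm1 computes expf(x) - 1.0f; for |x| near zero the latter cancels catastrophically in single precision. A small host-side illustration (plain C++, not ggml code):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = 1e-8f;
        printf("expm1f(x)      = %.10g\n", expm1f(x));        // ~1e-8, correct
        printf("expf(x) - 1.0f = %.10g\n", expf(x) - 1.0f);   // 0: the 1e-8 is lost in float
        return 0;
    }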
input : logf(1 + expf(input)); } // diff --git a/src/ggml.c b/src/ggml.c index 9be35c1b..a5846a23 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -935,6 +935,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "COS", "SUM", "SUM_ROWS", + "CUMSUM", "MEAN", "ARGMAX", "COUNT_EQUAL", @@ -990,6 +991,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "TIMESTEP_EMBEDDING", "ARGSORT", "LEAKY_RELU", + "TRI", + "FILL", "FLASH_ATTN_EXT", "FLASH_ATTN_BACK", @@ -1002,6 +1005,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "RWKV_WKV6", "GATED_LINEAR_ATTN", "RWKV_WKV7", + "SOLVE_TRI", "UNARY", @@ -1019,7 +1023,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90"); +static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1039,6 +1043,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cos(x)", "Σx", "Σx_k", + "cumsum(x)", "Σx/n", "argmax(x)", "count_equal(x)", @@ -1094,6 +1099,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", "leaky_relu(x)", + "tri(x)", + "fill(x, c)", "flash_attn_ext(x)", "flash_attn_back(x)", @@ -1106,6 +1113,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rwkv_wkv6(k, v, r, tf, td, s)", "gated_linear_attn(k, v, q, gate, s)", "rwkv_wkv7(r, w, k, v, a, b, s)", + "A X = B, A triangular, solve X", "unary(x)", @@ -1123,7 +1131,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90"); +static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1142,6 +1150,8 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "HARDSWISH", "HARDSIGMOID", "EXP", + "EXPM1", + "SOFTPLUS", "GELU_ERF", "XIELU", "FLOOR", @@ -1150,7 +1160,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "TRUNC", }; -static_assert(GGML_UNARY_OP_COUNT == 20, "GGML_UNARY_OP_COUNT != 20"); +static_assert(GGML_UNARY_OP_COUNT == 22, "GGML_UNARY_OP_COUNT != 22"); static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = { "REGLU", @@ -2258,6 +2268,30 @@ struct ggml_tensor * ggml_log_inplace( return ggml_log_impl(ctx, a, true); } +struct ggml_tensor * ggml_expm1( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_EXPM1); +} + +struct ggml_tensor * ggml_expm1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXPM1); +} + +struct ggml_tensor * ggml_softplus( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SOFTPLUS); +} + +struct ggml_tensor * ggml_softplus_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SOFTPLUS); +} + // ggml_sin static struct ggml_tensor * ggml_sin_impl( @@ -2341,6 +2375,21 @@ struct ggml_tensor * ggml_sum_rows( return result; } +// ggml_cumsum + +struct ggml_tensor * ggml_cumsum( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CUMSUM; + result->src[0] = a; + + return result; +} + // ggml_mean struct ggml_tensor * ggml_mean( @@ -2668,8 +2717,8 @@ struct ggml_tensor * ggml_xielu( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); 
ggml_set_op_params_i32(result, 0, (int32_t) GGML_UNARY_OP_XIELU); - ggml_set_op_params_f32(result, 1, beta + ggml_softplus(alpha_n)); - ggml_set_op_params_f32(result, 2, ggml_softplus(alpha_p)); + ggml_set_op_params_f32(result, 1, beta + ggml_compute_softplus_f32(alpha_n)); + ggml_set_op_params_f32(result, 2, ggml_compute_softplus_f32(alpha_p)); ggml_set_op_params_f32(result, 3, beta); ggml_set_op_params_f32(result, 4, eps); @@ -5028,6 +5077,61 @@ struct ggml_tensor * ggml_timestep_embedding( return result; } +// ggml_tri + +struct ggml_tensor * ggml_tri( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_tri_type type) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->ne[0] == a->ne[1]); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params_i32(result, 0, type); + + result->op = GGML_OP_TRI; + result->src[0] = a; + + return result; +} + +// ggml_fill + +static struct ggml_tensor * ggml_fill_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c, + bool inplace) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(a)); + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params_f32(result, 0, c); + + result->op = GGML_OP_FILL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_fill( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c) { + return ggml_fill_impl(ctx, a, c, false); +} + +struct ggml_tensor * ggml_fill_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c) { + return ggml_fill_impl(ctx, a, c, true); +} + // ggml_argsort struct ggml_tensor * ggml_argsort( @@ -5882,6 +5986,41 @@ struct ggml_tensor * ggml_opt_step_sgd( return result; } +// solve_tri + +struct ggml_tensor * ggml_solve_tri( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool left, + bool lower, + bool uni) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + + // A must be square and lower diagonal + GGML_ASSERT(a->ne[0] == a->ne[1]); + // B must have same outer dimension as A + GGML_ASSERT(a->ne[1] == b->ne[1]); + + // batch dimensions must be equal + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->ne[3] == b->ne[3]); + + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(b)); + + GGML_ASSERT(lower && left && !uni); // TODO: support other variants + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, b->ne[0], b->ne[1], b->ne[2], b->ne[3]); + + result->op = GGML_OP_SOLVE_TRI; + result->src[0] = a; + result->src[1] = b; + + return result; +} + //////////////////////////////////////////////////////////////////////////////// struct ggml_hash_set ggml_hash_set_new(size_t size) { @@ -6454,6 +6593,16 @@ static void ggml_compute_backward( ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad)); } } break; + case GGML_UNARY_OP_EXPM1: { + if (src0_needs_grads) { + ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_exp(ctx, src0))); + } + } break; + case GGML_UNARY_OP_SOFTPLUS: { + if (src0_needs_grads) { + ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sigmoid(ctx, src0))); + } + } break; default: { fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n", __func__, ggml_unary_op_name(ggml_get_unary_op(tensor))); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 38b7ddf2..b1179396 100644 --- 
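The two backward rules added above follow from d/dx (e^x - 1) = e^x and d/dx log(1 + e^x) = sigmoid(x); the graph code expresses them as ggml_mul(grad, ggml_exp(src0)) and ggml_mul(grad, ggml_sigmoid(src0)). A scalar sanity check against central differences (illustrative only, not how test-backend-ops validates gradients):

    #include <cmath>
    #include <cstdio>

    static float softplus_f(float x) { return (x > 20.0f) ? x : logf(1.0f + expf(x)); }
    static float expm1_f(float x)    { return expf(x) - 1.0f; }
    static float sigmoid_f(float x)  { return 1.0f / (1.0f + expf(-x)); }

    int main() {
        const float x = 0.3f, h = 1e-3f;

        const float d_expm1_num    = (expm1_f(x + h)    - expm1_f(x - h))    / (2.0f*h);
        const float d_softplus_num = (softplus_f(x + h) - softplus_f(x - h)) / (2.0f*h);

        printf("expm1':    analytic %.5f vs numeric %.5f\n", expf(x),      d_expm1_num);
        printf("softplus': analytic %.5f vs numeric %.5f\n", sigmoid_f(x), d_softplus_num);
        return 0;
    }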
a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -175,6 +175,38 @@ static void init_tensor_kq_mask(ggml_tensor * tensor, float min = -1.0f, float m ggml_backend_tensor_set(tensor, data_f16.data(), 0, data_f16.size()*sizeof(ggml_fp16_t)); } +// generate a lower triangular matrix +static void init_tensor_tril(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + GGML_ASSERT(tensor->ne[0] == tensor->ne[1]); + + GGML_TENSOR_LOCALS(int32_t, ne, tensor, ne); + GGML_TENSOR_LOCALS(size_t, nb, tensor, nb); + + std::vector data_f32(ne0*ne1*ne2*ne3); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(min, max); + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + for (int64_t i1 = 0; i1 < ne1; i1++) { + for (int64_t i0 = 0; i0 < ne0; i0++) { + int64_t idx = (i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3) / sizeof(float); + if (i0 <= i1) { + data_f32[idx] = dis(gen); + } else { + data_f32[idx] = 0.0f; + } + } + } + } + } + + ggml_backend_tensor_set(tensor, data_f32.data(), 0, ggml_nbytes(tensor)); +} + static std::vector tensor_to_float(const ggml_tensor * t) { std::vector tv; tv.reserve(ggml_nelements(t)); @@ -1804,7 +1836,8 @@ struct test_unary : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG || - op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU; + op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU || + op == GGML_UNARY_OP_EXPM1 || op == GGML_UNARY_OP_SOFTPLUS; ggml_tensor * a; if (v & 1) { @@ -2779,7 +2812,7 @@ struct test_bin_bcast : public test_case { const std::array nr; int nf; // number of fused ops, nf == 1 -> single op (no fusion) - bool run_whole_graph() override { return true; } + bool run_whole_graph() override { return nf > 1; } std::string vars() override { return VARS_TO_STR4(type, ne, nr, nf); @@ -5395,6 +5428,7 @@ struct test_pad : public test_case { } }; +// GGML_OP_PAD (with extension) struct test_pad_ext : public test_case { const ggml_type type; const std::array ne_a; @@ -5802,6 +5836,7 @@ struct test_opt_step_adamw : public test_case { } }; +// GGML_OP_OPT_STEP_SGD struct test_opt_step_sgd : public test_case { const ggml_type type; const std::array ne; @@ -5841,6 +5876,170 @@ struct test_opt_step_sgd : public test_case { } }; +// GGML_OP_CUMSUM +struct test_cumsum : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_cumsum(ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 5, 4, 3 }) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_cumsum(ctx, a); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } +}; + +// GGML_OP_XIELU +struct test_xielu : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_xielu(ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 5, 4, 3 }) + : 
type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + float alpha_n = 4.0f; + float alpha_p = 20.0f; + float beta = 0.5f; + float eps = 0.0000001f; + + ggml_tensor * out = ggml_xielu(ctx, a, alpha_n, alpha_p, beta, eps); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } +}; + +// GGML_OP_TRI +struct test_tri : public test_case { + const ggml_type type; + const std::array ne; + const ggml_tri_type tri_type; + + std::string vars() override { return VARS_TO_STR3(type, ne, tri_type); } + + test_tri(ggml_tri_type tri_type, ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 10, 4, 3 }) + : type(type), ne(ne), tri_type(tri_type) { + GGML_ASSERT(ne[0] == ne[1]); + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_tri(ctx, a, tri_type); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } +}; + +// GGML_OP_FILL +struct test_fill : public test_case { + const ggml_type type; + const std::array ne; + float c; + + std::string vars() override { return VARS_TO_STR3(type, ne, c); } + + test_fill(float c, ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 10, 4, 3 }) + : type(type), ne(ne), c(c) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_fill(ctx, a, c); + + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SOLVE_TRI +struct test_solve_tri : public test_case { + const ggml_type type; + const std::array ne_lhs; + const std::array ne_rhs; + + std::string vars() override { return VARS_TO_STR3(type, ne_lhs, ne_rhs); } + + test_solve_tri(ggml_type type = GGML_TYPE_F32, + std::array ne_lhs = { 10, 10, 4, 3 }, + std::array ne_rhs = { 3, 10, 4, 3 } + ) + : type(type), ne_lhs(ne_lhs), ne_rhs(ne_rhs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne_lhs[0], ne_lhs[1], ne_lhs[2], ne_lhs[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne_rhs[0], ne_rhs[1], ne_rhs[2], ne_rhs[3]); + ggml_set_param(b); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_solve_tri(ctx, a, b, true, true, false); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (strcmp(t->name, "a") == 0) { + // note: avoid zeros in the diagonal + init_tensor_tril(t, 0.1, 1.0f); + } else { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } + } +}; + enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, @@ -6282,6 +6481,9 @@ static std::vector> make_test_cases_eval() { for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (int v : {0, 1}) { for (int op = 0; op < 
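For the solve_tri test shapes, remember that ggml's ne[0] is the fastest-varying dimension: ne_lhs = {10, 10, 4, 3} is a 10x10 triangular A per batch, ne_rhs = {3, 10, 4, 3} is 10 rows by 3 right-hand-side columns per batch, with 4*3 independent batches. A small check mirroring the asserts in ggml_solve_tri (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    static void check_solve_tri_shapes(const int64_t a_ne[4], const int64_t b_ne[4]) {
        assert(a_ne[0] == a_ne[1]);                          // A must be square
        assert(a_ne[1] == b_ne[1]);                          // B must have as many rows as A
        assert(a_ne[2] == b_ne[2] && a_ne[3] == b_ne[3]);    // matching batch dimensions
    }

    int main() {
        const int64_t a_ne[4] = { 10, 10, 4, 3 };
        const int64_t b_ne[4] = {  3, 10, 4, 3 };
        check_solve_tri_shapes(a_ne, b_ne);   // the result X then takes the shape of B
        return 0;
    }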
GGML_UNARY_OP_COUNT; op++) { + if (op == GGML_UNARY_OP_XIELU) { + continue; // need extra params, separate test + } test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 128, 2, 2, 2 }, v)); test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 5, 7, 11, 13 }, v)); } @@ -7339,6 +7541,26 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_arange()); test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); + test_cases.emplace_back(new test_cumsum()); + + test_cases.emplace_back(new test_xielu()); + + test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER)); + test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER_DIAG)); + test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER)); + test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG)); + + test_cases.emplace_back(new test_fill(0.0f)); + test_cases.emplace_back(new test_fill(2.0f, GGML_TYPE_F32, { 303, 207, 11, 3 })); + test_cases.emplace_back(new test_fill(-152.0f, GGML_TYPE_F32, { 800, 600, 4, 4 })); + + test_cases.emplace_back(new test_solve_tri()); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 11, 11, 1, 1 }, { 5, 11, 1, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 17, 17, 2, 4 }, { 9, 17, 2, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 })); for (bool v : {false, true}) { test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));
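For a quick local run of just the new cases, test-backend-ops accepts an op filter; if I recall the flags correctly, something like "test-backend-ops test -o CUMSUM" (likewise SOLVE_TRI, TRI, FILL) restricts the run to a single op and "-b" selects a backend. Treat the exact flag spelling as an assumption rather than documentation.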