From: Georgi Gerganov Date: Wed, 18 Dec 2024 17:27:21 +0000 (+0200) Subject: tts : add OuteTTS support (llama/10784) X-Git-Tag: upstream/1.7.4~26 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f32ddb3b1c12d753f9e2aa4d063a6c4a5000d12d;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp tts : add OuteTTS support (llama/10784) * server : add "tokens" output ggml-ci * server : output embeddings for all tokens when pooling = none ggml-ci * server : be explicit about the pooling type in the tests ggml-ci * server : do not normalize embeddings when there is no pooling ggml-ci * llama : add OuteTTS support (wip) * wip * extract features * first conv * group norm * resnet conv * resnet * attn * pos net * layer norm * convnext * head * hann window * fix n_embd + remove llama.cpp hacks * compute hann window * fft * spectrum processing * clean-up * tts : receive input text and generate codes * clip : fix new conv name * tts : minor fix * tts : add header + minor fixes ggml-ci * tts : add matchematical constant ggml-ci * tts : fix sampling + cut initial noise * tts : fixes * tts : update default samplers ggml-ci * tts : text pre-processing * tts : outetts-voc -> wavtokenizer-dec * tts : remove hardcoded constants ggml-ci * tts : fix tensor shapes * llama : refactor wavtokenizer tensors ggml-ci * cont ggml-ci * cont [no ci] * llama : update WavTokenizer to non-causal attn * llama : handle no-vocab detokenization * tts : add Python example for OuteTTS (wip) * tts : extend python example to generate spectrogram ggml-ci * server : fix rebase artifacts * tts : enable "return_tokens" in Python example ggml-ci * tts : minor fixes * common : support HF download for vocoder --- diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b0c1ac9c..c714fc8c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1564,17 +1564,6 @@ extern "C" { int d1, // dilation dimension 1 bool is_2D); - GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1); // dilation dimension 1 - GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1592,6 +1581,23 @@ extern "C" { int s, // stride int d); // dilation + // depthwise + // TODO: this is very likely wrong for some cases! - needs more testing + GGML_API struct ggml_tensor * ggml_conv_1d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int d0); // dilation + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1611,7 +1617,6 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 - // kernel size is a->ne[0] x a->ne[1] // stride is equal to kernel size // padding is zero @@ -1638,6 +1643,18 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // depthwise + GGML_API struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0efd2b2e..2bbe5f48 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3760,13 +3760,84 @@ struct ggml_tensor * ggml_clamp( return result; } -// ggml_conv_1d - static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -GGML_API struct ggml_tensor * ggml_conv_1d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +struct ggml_tensor * ggml_im2col( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D, + enum ggml_type dst_type) { + if (is_2D) { + GGML_ASSERT(a->ne[2] == b->ne[2]); + } else { + //GGML_ASSERT(b->ne[1] % a->ne[1] == 0); + GGML_ASSERT(b->ne[1] == a->ne[1]); + GGML_ASSERT(b->ne[3] == 1); + } + + const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a"); + GGML_ASSERT((OW > 0) && "b too small compared to a"); + + const int64_t ne[4] = { + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], + OW, + is_2D ? OH : b->ne[2], + is_2D ? b->ne[3] : 1, + }; + + struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_IM2COL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_im2col_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t * ne, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D) { + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_IM2COL_BACK; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d + +struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -3796,137 +3867,75 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } -// ggml_conv_transpose_1d - -static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { - return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; -} +// ggml_conv_1d_dw -GGML_API struct ggml_tensor * ggml_conv_transpose_1d( +struct ggml_tensor * ggml_conv_1d_dw( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, int p0, int d0) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]); + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]); - GGML_ASSERT(p0 == 0); - GGML_ASSERT(d0 == 1); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); - const int64_t ne[4] = { - ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), - a->ne[1], b->ne[2], 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a); - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->src[0] = a; - result->src[1] = b; + result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1); return result; } -// ggml_conv_depthwise +// ggml_conv_1d_dw_ph -struct ggml_tensor * ggml_conv_depthwise_2d( +struct ggml_tensor * ggml_conv_1d_dw_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { - struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); - struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, - ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), - s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] - struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + int d0) { + return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); +} - new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] - struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); - result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] +// ggml_conv_transpose_1d - return result; +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; } -// ggml_conv_2d -// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] -// a: [OC,IC, KH, KW] -// b: [N, IC, IH, IW] -// result: [N, OH, OW, IC*KH*KW] -struct ggml_tensor * ggml_im2col( +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, - int s1, int p0, - int p1, - int d0, - int d1, - bool is_2D, - enum ggml_type dst_type) { - if(is_2D) { - GGML_ASSERT(a->ne[2] == b->ne[2]); - } else { - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(b->ne[3] == 1); - } - - const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); - GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a"); - GGML_ASSERT((OW > 0) && "b too small compared to a"); + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); const int64_t ne[4] = { - is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], - OW, - is_2D ? OH : b->ne[2], - is_2D ? b->ne[3] : 1, + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL; + result->op = GGML_OP_CONV_TRANSPOSE_1D; result->src[0] = a; result->src[1] = b; return result; } -struct ggml_tensor * ggml_im2col_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int64_t * ne, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D) { - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_IM2COL_BACK; - result->src[0] = a; - result->src[1] = b; - - return result; -} +// ggml_conv_2d // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] @@ -3973,6 +3982,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph( return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); } +// ggml_conv_2d_dw + +struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + + new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {