y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
// get the 5-th bit and store it in qh at the right position
- qh |= ((xi0 & 0x10) >> 4) << (j + 0);
- qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+ qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
}
memcpy(&y[i].qh, &qh, sizeof(qh));
y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
// get the 5-th bit and store it in qh at the right position
- qh |= ((xi0 & 0x10) >> 4) << (j + 0);
- qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+ qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
}
memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
"ALIBI",
"CLAMP",
"CONV_1D",
+ "CONV_TRANSPOSE_1D",
"CONV_2D",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
"UPSCALE",
+ "CONV_1D_STAGE_0",
+ "CONV_1D_STAGE_1",
+
"FLASH_ATTN",
"FLASH_FF",
"FLASH_ATTN_BACK",
"CROSS_ENTROPY_LOSS_BACK",
};
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"alibi(x)",
"clamp(x)",
"conv_1d(x)",
+ "conv_transpose_1d(x)",
"conv_2d(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
+ "conv_1d_stage_0(x)",
+ "conv_1d_stage_1(x)",
+
"flash_attn(x)",
"flash_ff(x)",
"flash_attn_back(x)",
"cross_entropy_loss_back(x,y)",
};
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_1D ] = true;
+ p[GGML_OP_CONV_1D_STAGE_0 ] = true;
+ p[GGML_OP_CONV_1D_STAGE_1 ] = true;
p[GGML_OP_CONV_2D ] = true;
+ p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
return result;
}
-
// ggml_reshape
struct ggml_tensor * ggml_reshape(
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}
-GGML_API struct ggml_tensor * ggml_conv_1d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int p0,
- int d0) {
- GGML_ASSERT(ggml_is_matrix(b));
+// im2col: [N, IC, IL] => [N, OL, IC*K]
+// a: [OC,IC, K]
+// b: [N, IC, IL]
+// result: [N, OL, IC*K]
+static struct ggml_tensor * ggml_conv_1d_stage_0(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int p0,
+ int d0) {
GGML_ASSERT(a->ne[1] == b->ne[1]);
bool is_node = false;
is_node = true;
}
+ const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+
const int64_t ne[4] = {
- ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
- a->ne[2], 1, 1,
+ a->ne[1] * a->ne[0],
+ OL,
+ b->ne[2],
+ 1,
};
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
int32_t params[] = { s0, p0, d0 };
ggml_set_op_params(result, params, sizeof(params));
- result->op = GGML_OP_CONV_1D;
+ result->op = GGML_OP_CONV_1D_STAGE_0;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+}
+
+// ggml_conv_1d_stage_1
+
+// ggml_conv_1d_stage_1 builds the graph node for the gemm step: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
+// a: [OC, IC, K] (the convolution kernel)
+// b: [N, OL, IC * K] (ggml_conv_1d passes the stage 0 result here)
+// result: [N, OC, OL] (always allocated as F32)
+static struct ggml_tensor * ggml_conv_1d_stage_1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = {
+ b->ne[1], // OL
+ a->ne[2], // OC
+ b->ne[2], // N
+ 1,
+ };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ result->op = GGML_OP_CONV_1D_STAGE_1;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; // gradient tensor only when a source carries one
result->src[0] = a;
result->src[1] = b;
return result;
}
+// ggml_conv_1d
+
+GGML_API struct ggml_tensor * ggml_conv_1d( // public entry point: chains the two stage ops below
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // kernel, handed to both stages
+ struct ggml_tensor * b, // input signal, consumed by stage 0
+ int s0, // stride, forwarded to stage 0
+ int p0, // padding, forwarded to stage 0
+ int d0) { // dilation, forwarded to stage 0
+ struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); // im2col node
+ result = ggml_conv_1d_stage_1(ctx, a, result); // gemm node taking the im2col node as src1
+ return result;
+}
+
+// GGML_API struct ggml_tensor * ggml_conv_1d(
+// struct ggml_context * ctx,
+// struct ggml_tensor * a,
+// struct ggml_tensor * b,
+// int s0,
+// int p0,
+// int d0) {
+// GGML_ASSERT(ggml_is_matrix(b));
+// GGML_ASSERT(a->ne[1] == b->ne[1]);
+// bool is_node = false;
+
+// if (a->grad || b->grad) {
+// GGML_ASSERT(false); // TODO: implement backward
+// is_node = true;
+// }
+
+// const int64_t ne[4] = {
+// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+// a->ne[2], 1, 1,
+// };
+// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+
+// int32_t params[] = { s0, p0, d0 };
+// ggml_set_op_params(result, params, sizeof(params));
+
+// result->op = GGML_OP_CONV_1D;
+// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+// result->src[0] = a;
+// result->src[1] = b;
+
+// return result;
+// }
+
// ggml_conv_1d_ph
struct ggml_tensor* ggml_conv_1d_ph(
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
}
+// ggml_conv_transpose_1d
+
+static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { // the caller passes b->ne[0] as ins, a->ne[0] as ks, its stride as s, and fixed p = 0, d = 1
+ return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; // presumably the usual transposed-convolution output length with output_padding = 0; confirm against the reference formula
+}
+
+GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // builds a GGML_OP_CONV_TRANSPOSE_1D graph node (forward only)
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // kernel; the work-size code labels its dims ne = [K, Cout, Cin]
+ struct ggml_tensor * b, // input; the work-size code labels its dims ne = [L, Cin]
+ int s0, // stride
+ int p0, // padding, must be 0 for now (asserted below)
+ int d0) { // dilation, must be 1 for now (asserted below)
+ GGML_ASSERT(ggml_is_matrix(b));
+ GGML_ASSERT(a->ne[2] == b->ne[1]); // kernel Cin must equal input Cin
+ GGML_ASSERT(a->ne[3] == 1);
+
+ GGML_ASSERT(p0 == 0);
+ GGML_ASSERT(d0 == 1);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = {
+ ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), // output length
+ a->ne[1], b->ne[2], 1, // Cout, then b->ne[2] copied through
+ };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ int32_t params[] = { s0, p0, d0 }; // the compute kernels read s0 back from op_params[0]
+ ggml_set_op_params(result, params, sizeof(params));
+
+ result->op = GGML_OP_CONV_TRANSPOSE_1D;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+}
+
// ggml_conv_2d
struct ggml_tensor * ggml_conv_2d(
// ggml_compute_forward_conv_1d
-static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
+static void ggml_compute_forward_conv_1d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int nth = params->nth;
const int nk = ne00;
- const int nh = nk/2;
- const int ew0 = ggml_up32(ne01);
+ // size of the convolution row - the kernel size unrolled across all input channels
+ const int ew0 = nk*ne01;
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
- GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
- // TODO: fix this memset (wsize is overestimated)
memset(params->wdata, 0, params->wsize);
- // prepare kernel data (src0)
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
- ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
- for (int64_t i00 = 0; i00 < ne00; i00++) {
- dst_data[i00*ew0 + i01] = src[i00];
- }
- }
- }
- }
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
+ ggml_fp16_t * dst_data = wdata;
- // prepare source data (src1)
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
+ for (int64_t ik = 0; ik < nk; ik++) {
+ const int idx0 = i0*s0 + ik*d0 - p0;
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- const float * const src = (float *)((char *) src1->data + i11*nb11);
- ggml_fp16_t * dst_data = wdata;
- for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
+ if(!(idx0 < 0 || idx0 >= ne10)) {
+ dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
+ }
}
}
}
}
// total rows in dst
- const int nr = ne02;
+ const int nr = ne2;
// rows per thread
const int dr = (nr + nth - 1)/nth;
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
- for (int i1 = ir0; i1 < ir1; i1++) {
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
- for (int64_t i0 = 0; i0 < ne10; ++i0) {
- dst_data[i0] = 0;
- for (int k = -nh; k <= nh; k++) {
- float v = 0.0f;
- ggml_vec_dot_f16(ew0, &v,
- (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0,
- (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
- dst_data[i0] += v;
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+ for (int i2 = 0; i2 < ne2; i2++) {
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
+
+ for (int i0 = 0; i0 < ne0; i0++) {
+ ggml_vec_dot_f16(ew0, dst_data + i0,
+ (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
+ (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
}
}
}
}
-static void ggml_compute_forward_conv_1d_s1_ph_f32(
+static void ggml_compute_forward_conv_1d_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int nth = params->nth;
const int nk = ne00;
- const int nh = nk/2;
- const int ew0 = ggml_up32(ne01);
+ const int ew0 = nk*ne01;
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
- GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
- // TODO: fix this memset (wsize is overestimated)
memset(params->wdata, 0, params->wsize);
- // prepare kernel data (src0)
- {
- float * const wdata = (float *) params->wdata + 0;
+ float * const wdata = (float *) params->wdata + 0;
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
- float * dst_data = wdata + i02*ew0*ne00;
- for (int64_t i00 = 0; i00 < ne00; i00++) {
- dst_data[i00*ew0 + i01] = src[i00];
- }
- }
- }
- }
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
+ float * dst_data = wdata;
- // prepare source data (src1)
- {
- float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
+ for (int64_t ik = 0; ik < nk; ik++) {
+ const int idx0 = i0*s0 + ik*d0 - p0;
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- const float * const src = (float *)((char *) src1->data + i11*nb11);
- float * dst_data = wdata;
- for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = src[i10];
+ if(!(idx0 < 0 || idx0 >= ne10)) {
+ dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
+ }
}
}
}
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
- for (int i1 = ir0; i1 < ir1; i1++) {
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
- for (int64_t i0 = 0; i0 < ne10; ++i0) {
- dst_data[i0] = 0;
- for (int k = -nh; k <= nh; k++) {
- float v = 0.0f;
- ggml_vec_dot_f32(ew0, &v,
- (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0,
- (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
- dst_data[i0] += v;
+ float * const wdata = (float *) params->wdata + 0;
+
+ for (int i2 = 0; i2 < ne2; i2++) {
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
+
+ for (int i0 = 0; i0 < ne0; i0++) {
+ ggml_vec_dot_f32(ew0, dst_data + i0,
+ (float *) ((char *) src0->data + i1*nb02),
+ (float *) wdata + i2*nb2 + i0*ew0);
+ }
+ }
+ }
+}
+
+// Multi-threaded C = A * B^T with f16 inputs and f32 output.
+//   A: [m, k] row-major, B: [n, k] row-major, C: [m, n] row-major
+//   ith/nth: index of the calling thread and total number of threads
+// The larger output dimension is split into contiguous per-thread chunks, so
+// each element of C is written by exactly one thread.
+static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
+                             ggml_fp16_t * A,
+                             ggml_fp16_t * B,
+                             float * C,
+                             const int ith, const int nth) {
+    // by default this thread covers the full range of both dimensions
+    int64_t m0 = 0, m1 = m;
+    int64_t n0 = 0, n1 = n;
+
+    // keep the partition arithmetic in 64 bits so large m/n are not narrowed to int
+    if (m > n) {
+        const int64_t dp = (m + nth - 1)/nth; // rows of C per thread
+        m0 = dp*ith;
+        m1 = MIN(m0 + dp, m);
+    } else {
+        const int64_t dp = (n + nth - 1)/nth; // columns of C per thread
+        n0 = dp*ith;
+        n1 = MIN(n0 + dp, n);
+    }
+
+    // block-tiling: walk this thread's part of C in 16x16 tiles
+    const int64_t blck_n = 16;
+    const int64_t blck_m = 16;
+
+    for (int64_t j = n0; j < n1; j += blck_n) {
+        for (int64_t i = m0; i < m1; i += blck_m) {
+            for (int64_t ii = i; ii < i + blck_m && ii < m1; ii++) {
+                for (int64_t jj = j; jj < j + blck_n && jj < n1; jj++) {
+                    // C[ii][jj] = dot(row ii of A, row jj of B)
+                    ggml_vec_dot_f16(k,
+                        C + ii*n + jj,
+                        A + ii * k,
+                        B + jj * k);
+                }
}
}
}
}
-static void ggml_compute_forward_conv_1d_s1_ph(
+// src0: kernel [OC, IC, K] (F16; only its K dimension is read here)
+// src1: signal [N, IC, IL] (F32)
+// dst: result [N, OL, IC*K] (F16 im2col buffer; despite the _f32 suffix the output is F16)
+static void ggml_compute_forward_conv_1d_stage_0_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- switch (src0->type) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+ int64_t t0 = ggml_perf_time_us();
+ UNUSED(t0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS;
+
+ const int64_t N = ne12; // batch size
+ const int64_t IC = ne11; // input channels
+ const int64_t IL = ne10; // input length
+
+ const int64_t K = ne00; // kernel size
+
+ const int64_t OL = ne1; // output length
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; // stride
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; // padding
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; // dilation
+
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ if (params->type == GGML_TASK_INIT) {
+ memset(dst->data, 0, ggml_nbytes(dst)); // zero-fill so taps that fall outside the input stay 0
+ return;
+ }
+
+ if (params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // im2col: [N, IC, IL] => [N, OL, IC*K]
+ {
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+ for (int64_t in = 0; in < N; in++) {
+ for (int64_t iol = 0; iol < OL; iol++) {
+ for (int64_t iic = ith; iic < IC; iic+=nth) { // input channels are interleaved across threads
+
+ // micro kernel: copy the K taps of one (batch, output position, channel) triple
+ ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
+ const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
+
+ for (int64_t ik = 0; ik < K; ik++) {
+ const int64_t iil = iol*s0 + ik*d0 - p0; // input index read by tap ik
+
+ if (!(iil < 0 || iil >= IL)) { // out-of-range taps are skipped and keep the zero from INIT
+ dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
+// src0: [OC, IC, K]     (F16 kernel, read as a row-major [OC, IC*K] matrix)
+// src1: [N, OL, IC * K] (F16 im2col rows)
+// result: [N, OC, OL]   (F32)
+// Runs one gemm per batch entry; the threading split happens inside gemm_f16_out_f32.
+static void ggml_compute_forward_conv_1d_stage_1_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    if (params->type == GGML_TASK_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    const int64_t N  = ne12;
+    const int64_t OL = ne11;
+
+    const int64_t OC = ne02;
+    const int64_t IC = ne01;
+    const int64_t K  = ne00;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t m = OC;
+    const int64_t n = OL;
+    const int64_t k = IC * K;
+
+    // the kernel matrix is shared by every batch entry
+    ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
+
+    // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
+    for (int64_t i = 0; i < N; i++) {
+        // each batch entry of src1 holds n rows of k values, so the batch
+        // stride is n*k (not m*k, which is only right when OC == OL)
+        ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * n * k; // [n, k]
+        float * C = (float *)dst->data + i * m * n; // [m, n]
+
+        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
+    }
+}
+
+static void ggml_compute_forward_conv_1d( // dispatches GGML_OP_CONV_1D to a kernel chosen by the kernel tensor's type
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch(src0->type) { // only src0 is inspected; each kernel asserts its own layout requirements
case GGML_TYPE_F16:
{
- ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
{
- ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false); // any other kernel type is rejected
+ } break;
+ }
+}
+
+static void ggml_compute_forward_conv_1d_stage_0(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch(src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
} break;
default:
{
}
}
-static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
+static void ggml_compute_forward_conv_1d_stage_1( // dispatches GGML_OP_CONV_1D_STAGE_1 by the kernel tensor's type
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch(src0->type) {
+ case GGML_TYPE_F16: // the only kernel type with a stage 1 implementation
+ {
+ ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false); // any other kernel type is rejected
+ } break;
+ }
+}
+
+// ggml_compute_forward_conv_transpose_1d
+
+static void ggml_compute_forward_conv_transpose_1d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int ith = params->ith;
const int nth = params->nth;
- const int nk = ne00;
- const int nh = nk/2;
-
- const int ew0 = ggml_up32(ne01);
+ const int nk = ne00*ne01*ne02;
- GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
- // TODO: fix this memset (wsize is overestimated)
memset(params->wdata, 0, params->wsize);
- // prepare kernel data (src0)
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
- ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
for (int64_t i00 = 0; i00 < ne00; i00++) {
- dst_data[i00*ew0 + i01] = src[i00];
+ dst_data[i00*ne02 + i02] = src[i00];
}
}
}
}
- // prepare source data (src1)
+ // permute source data (src1) from (L x Cin) to (Cin x L)
{
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+ ggml_fp16_t * dst_data = wdata;
for (int64_t i11 = 0; i11 < ne11; i11++) {
const float * const src = (float *)((char *) src1->data + i11*nb11);
- ggml_fp16_t * dst_data = wdata;
for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
+ dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
return;
}
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
// total rows in dst
- const int nr = ne02;
+ const int nr = ne1;
// rows per thread
const int dr = (nr + nth - 1)/nth;
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+ ggml_fp16_t * const wdata_src = wdata + nk;
+
for (int i1 = ir0; i1 < ir1; i1++) {
float * dst_data = (float *)((char *) dst->data + i1*nb1);
- for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
- dst_data[i0/2] = 0;
- for (int k = -nh; k <= nh; k++) {
- float v = 0.0f;
- ggml_vec_dot_f16(ew0, &v,
- (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0,
- (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
- dst_data[i0/2] += v;
+ ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
+ for (int i10 = 0; i10 < ne10; i10++) {
+ const int i1n = i10*ne11;
+ for (int i00 = 0; i00 < ne00; i00++) {
+ float v = 0;
+ ggml_vec_dot_f16(ne02, &v,
+ (ggml_fp16_t *) wdata_src + i1n,
+ (ggml_fp16_t *) wdata_kernel + i00*ne02);
+ dst_data[i10*s0 + i00] += v;
}
}
}
}
-static void ggml_compute_forward_conv_1d_s2_ph_f32(
+static void ggml_compute_forward_conv_transpose_1d_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const int ith = params->ith;
const int nth = params->nth;
- const int nk = ne00;
- const int nh = nk/2;
-
- const int ew0 = ggml_up32(ne01);
+ const int nk = ne00*ne01*ne02;
- GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
- // TODO: fix this memset (wsize is overestimated)
memset(params->wdata, 0, params->wsize);
- // prepare kernel data (src0)
+ // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
{
float * const wdata = (float *) params->wdata + 0;
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
- float * dst_data = wdata + i02*ew0*ne00;
+ float * dst_data = wdata + i01*ne00*ne02;
for (int64_t i00 = 0; i00 < ne00; i00++) {
- dst_data[i00*ew0 + i01] = src[i00];
+ // dst_data already points at the i01 slab, so index only inside it;
+ // adding i01*ne00*ne02 again misplaces the value and writes out of bounds (the f16 path indexes the same way)
+ dst_data[i00*ne02 + i02] = src[i00];
}
}
}
// prepare source data (src1)
{
- float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
+ float * const wdata = (float *) params->wdata + nk;
+ float * dst_data = wdata;
for (int64_t i11 = 0; i11 < ne11; i11++) {
const float * const src = (float *)((char *) src1->data + i11*nb11);
- float * dst_data = wdata;
for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[(i10 + nh)*ew0 + i11] = src[i10];
+ dst_data[i10*ne11 + i11] = src[i10];
}
}
}
return;
}
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
// total rows in dst
- const int nr = ne02;
+ const int nr = ne1;
// rows per thread
const int dr = (nr + nth - 1)/nth;
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
+ float * const wdata = (float *) params->wdata + 0;
+ float * const wdata_src = wdata + nk;
+
for (int i1 = ir0; i1 < ir1; i1++) {
float * dst_data = (float *)((char *) dst->data + i1*nb1);
- for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
- dst_data[i0/2] = 0;
- for (int k = -nh; k <= nh; k++) {
- float v = 0.0f;
- ggml_vec_dot_f32(ew0, &v,
- (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0,
- (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
- dst_data[i0/2] += v;
+ float * wdata_kernel = wdata + i1*ne02*ne00;
+ for (int i10 = 0; i10 < ne10; i10++) {
+ const int i1n = i10*ne11;
+ for (int i00 = 0; i00 < ne00; i00++) {
+ float v = 0;
+ ggml_vec_dot_f32(ne02, &v,
+ wdata_src + i1n,
+ wdata_kernel + i00*ne02);
+ dst_data[i10*s0 + i00] += v;
}
}
}
}
-static void ggml_compute_forward_conv_1d_s2_ph(
+static void ggml_compute_forward_conv_transpose_1d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
switch (src0->type) {
case GGML_TYPE_F16:
{
- ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
{
- ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_conv_1d
-
-static void ggml_compute_forward_conv_1d(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
- GGML_ASSERT(d0 == 1); // dilation not supported
- GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
- if (s0 == 1) {
- ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst);
- } else if (s0 == 2) {
- ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
- } else {
- GGML_ASSERT(false); // only stride 1 and 2 supported
- }
-}
-
// ggml_compute_forward_conv_2d
static void ggml_compute_forward_conv_2d_f16_f32(
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- for (int i12 = 0; i12 < ne12; i12++) {
- const float * const src = (float *)((char *) src1->data + i12*nb12);
- ggml_fp16_t * dst_data = wdata;
-
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- for (int ik1 = 0; ik1 < nk1; ik1++) {
- for (int ik0 = 0; ik0 < nk0; ik0++) {
- const int idx0 = i0*s0 + ik0*d0 - p0;
- const int idx1 = i1*s1 + ik1*d1 - p1;
-
- if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
- dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
- GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
+ for (int i13 = 0; i13 < ne13; i13++) {
+ for (int i12 = 0; i12 < ne12; i12++) {
+ const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
+ ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
+ const int idx0 = i0*s0 + ik0*d0 - p0;
+ const int idx1 = i1*s1 + ik1*d1 - p1;
+
+ if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+ GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
+ }
}
}
}
{
ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
} break;
+ case GGML_OP_CONV_1D_STAGE_0:
+ {
+ ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
+ } break;
+ case GGML_OP_CONV_1D_STAGE_1:
+ {
+ ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
+ } break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ {
+ ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
+ } break;
case GGML_OP_CONV_2D:
{
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_CONV_1D_STAGE_0:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_OP_CONV_1D_STAGE_1:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
case GGML_OP_CONV_2D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
GGML_ASSERT(false); // TODO: not implemented
GGML_ASSERT(node->src[1]->ne[2] == 1);
GGML_ASSERT(node->src[1]->ne[3] == 1);
+ const int64_t ne00 = node->src[0]->ne[0];
+ const int64_t ne01 = node->src[0]->ne[1];
+ const int64_t ne02 = node->src[0]->ne[2];
+
+ const int64_t ne10 = node->src[1]->ne[0];
+ const int64_t ne11 = node->src[1]->ne[1];
+
+ const int64_t ne0 = node->ne[0];
+ const int64_t ne1 = node->ne[1];
+ const int64_t nk = ne00;
+ const int64_t ew0 = nk * ne01;
+
+ UNUSED(ne02);
+ UNUSED(ne10);
+ UNUSED(ne11);
+
size_t cur = 0;
- const int nk = node->src[0]->ne[0];
if (node->src[0]->type == GGML_TYPE_F16 &&
- node->src[1]->type == GGML_TYPE_F32) {
- cur = sizeof(ggml_fp16_t)*(
- nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
- ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
- );
+ node->src[1]->type == GGML_TYPE_F32) {
+ cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
+ } else if (node->src[0]->type == GGML_TYPE_F32 &&
+ node->src[1]->type == GGML_TYPE_F32) {
+ cur = sizeof(float)*(ne0*ne1*ew0);
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ work_size = MAX(work_size, cur);
+ } break;
+ case GGML_OP_CONV_1D_STAGE_0:
+ {
+ n_tasks = n_threads;
+ } break;
+ case GGML_OP_CONV_1D_STAGE_1:
+ {
+ n_tasks = n_threads;
+ } break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ {
+ n_tasks = n_threads;
+
+ GGML_ASSERT(node->src[0]->ne[3] == 1);
+ GGML_ASSERT(node->src[1]->ne[2] == 1);
+ GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+ const int64_t ne00 = node->src[0]->ne[0]; // K
+ const int64_t ne01 = node->src[0]->ne[1]; // Cout
+ const int64_t ne02 = node->src[0]->ne[2]; // Cin
+
+ const int64_t ne10 = node->src[1]->ne[0]; // L
+ const int64_t ne11 = node->src[1]->ne[1]; // Cin
+
+ size_t cur = 0;
+ if (node->src[0]->type == GGML_TYPE_F16 &&
+ node->src[1]->type == GGML_TYPE_F32) {
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
+ cur += sizeof(ggml_fp16_t)*ne10*ne11;
} else if (node->src[0]->type == GGML_TYPE_F32 &&
- node->src[1]->type == GGML_TYPE_F32) {
- cur = sizeof(float)*(
- nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
- ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
- );
+ node->src[1]->type == GGML_TYPE_F32) {
+ cur += sizeof(float)*ne00*ne01*ne02;
+ cur += sizeof(float)*ne10*ne11;
} else {
GGML_ASSERT(false);
}
if (callback) {
callback(callback_data, accum_step, &sched, &cancel);
if (cancel) {
- break;
+ return GGML_OPT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
- if (cancel) {
- return GGML_OPT_DID_NOT_CONVERGE;
- }
fx *= accum_norm;
opt->adam.fx_prev = fx;
// run the optimizer
for (int t = 0; t < params.adam.n_iter; ++t) {
- if (cancel) {
- break;
- }
opt->iter = iter0 + t + 1;
GGML_PRINT_DEBUG ("=== iter %d ===\n", t);
if (callback) {
callback(callback_data, accum_step, &sched, &cancel);
if (cancel) {
- break;
+ return GGML_OPT_CANCEL; // stop right away when the callback requests cancellation
}
}
// ggml_graph_reset (gf);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
- if (cancel) {
- break;
- }
fx *= accum_norm;
opt->loss_after = fx;
finit = *fx;
dgtest = params->lbfgs.ftol*dginit;
- while (!*cancel) {
+ while (true) {
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_mad_f32(nx, x, d, *step);
float sched = 0;
callback(callback_data, accum_step, &sched, cancel);
if (*cancel) {
- break;
+ return GGML_OPT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_opt_acc_grad(np, ps, g, accum_norm);
*fx += ggml_get_f32_1d(f, 0);
}
- if (*cancel) {
- break;
- }
*fx *= accum_norm;
}
float sched = 0;
callback(callback_data, accum_step, &sched, &cancel);
if (cancel) {
- break;
+ return GGML_OPT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
- if (cancel) {
- return GGML_OPT_DID_NOT_CONVERGE;
- }
fx *= accum_norm;
opt->loss_before = fx;
ggml_vec_cpy_f32(nx, gp, g);
ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
- if (!cancel) {
- break;
+ if (cancel) {
+ return GGML_OPT_CANCEL;
}
if (ls < 0) {