};
}
-// ggml_compute_forward_conv_2d_sk_p0
+// ggml_compute_forward_conv_2d
-static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+static void ggml_compute_forward_conv_2d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
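+ // opt0 carries the convolution parameters as six int32 values: stride (s0, s1), padding (p0, p1), dilation (d0, d1)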
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ ... @@
// size of the convolution row - the kernel size unrolled across all channels
const int ew0 = nk0*nk1*ne02;
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb10 == sizeof(float));
-
- if (params->type == GGML_TASK_INIT) {
- // TODO: fix this memset (wsize is overestimated)
- memset(params->wdata, 0, params->wsize);
-
- // prepare source data (src1)
- {
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
- for (int i13 = 0; i13 < ne13; i13++) {
- for (int i12 = 0; i12 < ne12; i12++) {
- const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
- ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
-
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- for (int ik1 = 0; ik1 < nk1; ik1++) {
- for (int ik0 = 0; ik0 < nk0; ik0++) {
- dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
- GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
- }
- }
- }
- }
- }
- }
- }
-
- return;
- }
-
- if (params->type == GGML_TASK_FINALIZE) {
- return;
- }
-
- // total patches in dst
- const int np = ne2;
-
- // patches per thread
- const int dp = (np + nth - 1)/nth;
-
- // patch range for this thread
- const int ip0 = dp*ith;
- const int ip1 = MIN(ip0 + dp, np);
-
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
- for (int i3 = 0; i3 < ne3; i3++) {
- for (int i2 = ip0; i2 < ip1; i2++) {
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
-
- for (int i1 = 0; i1 < ne1; ++i1) {
- for (int i0 = 0; i0 < ne0; ++i0) {
- ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
- (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
- (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_conv_2d_sk_p0(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- switch (src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
- } break;
- case GGML_TYPE_F32:
- {
- //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
- GGML_ASSERT(false);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-// ggml_compute_forward_conv_2d_any
-
-static void ggml_compute_forward_conv_2d_f16_f32(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * opt0,
- struct ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
- GGML_TENSOR_BINARY_OP_LOCALS;
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nk0 = ne00;
- const int nk1 = ne01;
-
- const int ew0 = nk0*nk1*ne02;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
@@ ... @@
for (int i0 = 0; i0 < ne0; i0++) {
for (int ik1 = 0; ik1 < nk1; ik1++) {
for (int ik0 = 0; ik0 < nk0; ik0++) {
- const int idx1 = i1*s1 + ik1*d1 - p1;
const int idx0 = i0*s0 + ik0*d0 - p0;
-
+ const int idx1 = i1*s1 + ik1*d1 - p1;
+
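+ // copy only in-bounds source elements; positions outside src1 correspond to padding and are skipped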
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
- GGML_FP32_TO_FP16(src[idx1 * ne10 + idx0]);
+ GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
}
}
}
@@ ... @@
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- for (int i2 = ip0; i2 < ip1; i2++) {
- float * dst_data = (float *)((char *) dst->data + i2*nb2);
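+ // also iterate over the batch dimension (i3); i2 still covers this thread's range of patches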
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ip0; i2 < ip1; i2++) {
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
- for (int i1 = 0; i1 < ne1; ++i1) {
- for (int i0 = 0; i0 < ne0; ++i0) {
- ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
- (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
- (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+ for (int i1 = 0; i1 < ne1; ++i1) {
+ for (int i0 = 0; i0 < ne0; ++i0) {
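+ // ew0-element dot product: the i2-th kernel from src0 against the unrolled source row for output position (i1, i0)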
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+ (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
+ }
}
-
}
}
}
-static void ggml_compute_forward_conv_2d_any(
+static void ggml_compute_forward_conv_2d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst
- ) {
+ ) {
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
} break;
@@ ... @@
GGML_ASSERT(false);
} break;
}
-
-}
-
-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
- const struct ggml_compute_params* params,
- const struct ggml_tensor* src0,
- const struct ggml_tensor* src1,
- const struct ggml_tensor* opt0,
- struct ggml_tensor* dst) {
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-
- if (s0 == src0->ne[0] && s1 == src0->ne[1] && p0 == 0 && p1 == 0) {
- ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
- }
- else {
- ggml_compute_forward_conv_2d_any(params, src0, src1, opt0, dst);
- };
}
// ggml_compute_forward_pool_1d_sk_p0