From: David Miller
Date: Fri, 14 Jul 2023 11:38:47 +0000 (-0700)
Subject: ggml : general conv_2d CPU implementation (#352)
X-Git-Tag: upstream/0.0.1642~1318
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=a1bd9826b6ccc4900ad0225aa76d36005ebeca0c;p=pkg%2Fggml%2Fsources%2Fggml

ggml : general conv_2d CPU implementation (#352)

* Conv2d s0 == s1 == 1, d0 == d1 == 1, variable padding

* Mark unused variables

* Support variable strides

* Handle all non-kernel-width convolutions with the same general conv2d

* General 2d Conv

* Remove old function

* Group functions

* Rearrange

* General Conv2d implementation

* Clean up whitespace
---

diff --git a/src/ggml.c b/src/ggml.c
index 7a9f84ef..e45f8ef8 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -13102,6 +13102,125 @@ static void ggml_compute_forward_conv_2d_sk_p0(
         } break;
     }
 }
 
+// ggml_compute_forward_conv_2d_any
+
+static void ggml_compute_forward_conv_2d_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk0 = ne00;
+    const int nk1 = ne01;
+
+    const int ew0 = nk0*nk1*ne02;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
+
+        // prepare source data (src1)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                const int idx1 = i1*s1 + ik1*d1 - p1;
+                                const int idx0 = i0*s0 + ik0*d0 - p0;
+
+                                if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
+                                    dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                        GGML_FP32_TO_FP16(src[idx1 * ne10 + idx0]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+    for (int i2 = ip0; i2 < ip1; i2++) {
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+
+        for (int i1 = 0; i1 < ne1; ++i1) {
+            for (int i0 = 0; i0 < ne0; ++i0) {
+                ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+                        (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+                        (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+            }
+
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_any(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
+              struct ggml_tensor * dst
+    ) {
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
+                GGML_ASSERT(false);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+
+}
 
 // ggml_compute_forward_conv_2d
@@ -13117,16 +13236,13 @@ static void ggml_compute_forward_conv_2d(
     const int32_t p1 = ((const int32_t*)(opt0->data))[3];
     const int32_t d0 = ((const int32_t*)(opt0->data))[4];
     const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-    GGML_ASSERT(d0 == 1); // dilation not supported
-    GGML_ASSERT(d1 == 1);
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(p1 == 0);
 
-    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
+    if (s0 == src0->ne[0] && s1 == src0->ne[1] && p0 == 0 && p1 == 0) {
         ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-    } else {
-        GGML_ASSERT(false); // only stride equal to kernel size is supported
     }
+    else {
+        ggml_compute_forward_conv_2d_any(params, src0, src1, opt0, dst);
+    };
 }
 
 // ggml_compute_forward_pool_1d_sk_p0
@@ -16585,19 +16701,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         const int64_t ne11 = node->src[1]->ne[1]; // H
                         const int64_t ne12 = node->src[1]->ne[2]; // C
 
+                        const int64_t ne0 = node->ne[0];
+                        const int64_t ne1 = node->ne[1];
+                        const int64_t ne2 = node->ne[2];
                         const int64_t nk = ne00*ne01;
+                        const int64_t ew0 = nk * ne02;
 
-                        UNUSED(ne02);
                         UNUSED(ne03);
-                        UNUSED(nk);
+                        UNUSED(ne2);
 
                         size_t cur = 0;
 
                         if (node->src[0]->type == GGML_TYPE_F16 &&
-                            node->src[1]->type == GGML_TYPE_F32) {
-                            cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                                node->src[1]->type == GGML_TYPE_F32) {
+                            cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                         } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                            node->src[1]->type == GGML_TYPE_F32) {
+                                node->src[1]->type == GGML_TYPE_F32) {
                             cur = sizeof(float)* (ne10*ne11*ne12);
                         } else {
                             GGML_ASSERT(false);
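
---

Editor's note on the new code path: in its GGML_TASK_INIT phase, ggml_compute_forward_conv_2d_f16_f32 performs an im2col-style gather, writing one contiguous row of ew0 = nk0*nk1*ne02 samples per output pixel into the work buffer (taps whose strided/dilated/padded source index falls outside src1 stay at the zero left by the memset), so that the compute phase reduces to a single ggml_vec_dot_f16 per output element. The following is a minimal standalone sketch of that gather for a single channel in plain float; the function im2col_2d, its parameter names, and the main driver are illustrative assumptions for this note rather than ggml API, though the idx0/idx1 index math mirrors the patch.

/*
 * Standalone sketch (not part of the patch) of the im2col-style gather used
 * by the general conv2d path. Single channel, plain float instead of
 * ggml_fp16_t; im2col_2d and rows are illustrative names, not ggml API.
 */
#include <stdio.h>
#include <string.h>

// Gather one kernel-sized, dilated window per output pixel into a contiguous
// row of length nk0*nk1, mirroring the idx0/idx1 computation in the patch.
// Out-of-bounds taps are left at zero, which implements the padding.
static void im2col_2d(const float * src, int W, int H,
                      int nk0, int nk1,   // kernel width/height
                      int s0, int s1,     // strides
                      int p0, int p1,     // padding
                      int d0, int d1,     // dilation
                      int OW, int OH,     // output width/height
                      float * rows) {
    memset(rows, 0, sizeof(float)*OW*OH*nk0*nk1); // zero == padding
    for (int i1 = 0; i1 < OH; i1++) {
        for (int i0 = 0; i0 < OW; i0++) {
            for (int ik1 = 0; ik1 < nk1; ik1++) {
                for (int ik0 = 0; ik0 < nk0; ik0++) {
                    const int idx1 = i1*s1 + ik1*d1 - p1; // source row
                    const int idx0 = i0*s0 + ik0*d0 - p0; // source column
                    if (!(idx1 < 0 || idx1 >= H || idx0 < 0 || idx0 >= W)) {
                        rows[(i1*OW + i0)*(nk0*nk1) + ik1*nk0 + ik0] = src[idx1*W + idx0];
                    }
                }
            }
        }
    }
}

int main(void) {
    // 4x4 input, 3x3 kernel, stride 1, padding 1, dilation 1 -> 4x4 output
    float src[4*4];
    for (int i = 0; i < 4*4; i++) src[i] = (float) i;

    float rows[4*4*3*3];
    im2col_2d(src, 4, 4, 3, 3, /*s*/ 1, 1, /*p*/ 1, 1, /*d*/ 1, 1, 4, 4, rows);

    // each output pixel is now a single dot product of its row with the
    // flattened kernel -- the job ggml_vec_dot_f16 does in the patch
    printf("row for output pixel (0,0):");
    for (int k = 0; k < 3*3; k++) {
        printf(" %g", rows[k]);
    }
    printf("\n"); // prints: 0 0 0 0 0 1 0 4 5 (top/left taps fall in padding)
    return 0;
}

With this layout, the work-buffer estimate added to ggml_graph_plan also falls out directly: one row of ew0 half-precision values per output pixel, hence cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0).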