// Context for binary operations
struct htp_binary_context {
    struct htp_ops_context * octx;
-    struct fastdiv_values dim1_div;
-    struct fastdiv_values dim2_div;
-    struct fastdiv_values dim12_div;
+
+    // Fast-divide helpers for decomposing a flat src0 row index into (i01, i02, i03)
+    struct fastdiv_values src0_dim1_div; // ne01
+    struct fastdiv_values src0_dim2_div; // ne02
+    struct fastdiv_values src0_dim12_div;// ne01*ne02 (divisor is src0->ne[1] * src0->ne[2])
    struct fastdiv_values src1_dim1_div; // ne11
    struct fastdiv_values src1_dim2_div; // ne12
    struct fastdiv_values src1_dim3_div; // ne13
-    uint32_t nrows_per_thread;
-    bool split_at_ne01;
-    bool split_at_ne02;
-
-    // Precomputed values
    uint32_t block_max;
+    uint32_t nrows_per_thread; // rows assigned per worker thread (ceil(src0_nrows / n_threads))
    size_t src0_row_size_aligned;
    size_t src1_row_size_aligned;
    size_t dst_row_size_aligned;
-    uint32_t src1_fetch_rows; // 1 or block_max
-    uint32_t src1_dma_stride; // 0 or stride
+
+    // When set, DMA blocks must not cross the ne01/ne02 boundary
+    // (non-contiguous layouts or src1 broadcast along that dim)
+    bool split_at_ne01;
+    bool split_at_ne02;
};
-#define htp_binary_preamble \
+// Declares the common locals used by every binary-op kernel:
+// src0/src1/dst tensor pointers plus the dst nb2/nb3 byte strides.
+#define htp_binary_preamble \
    const struct htp_tensor * src0 = &octx->src0; \
    const struct htp_tensor * src1 = &octx->src1; \
    struct htp_tensor * dst = &octx->dst; \
    const uint32_t nb2 = dst->nb[2]; \
    const uint32_t nb3 = dst->nb[3];
-static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row,
- uint32_t ne01, uint32_t ne02) {
+static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row, uint32_t ne01, uint32_t ne02) {
uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
+ i03 = fastdiv(ir, &bctx->src0_dim12_div);
rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
+ i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint32_t rows_left = end_row - ir;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
+ FARF(HIGH, "binary-scalar: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
+
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
+ i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
+ i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
- dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
+ i03 = fastdiv(ir, &bctx->src0_dim12_div);
rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
+ i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
// src1 indices (broadcast/repeat)
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
- p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
+ p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
- p02 = fastdiv(prem, &bctx->dim1_div);
+ p02 = fastdiv(prem, &bctx->src0_dim1_div);
p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
+ FARF(HIGH, "binary-same-shape: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
+
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
+ i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
+ i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint32_t i13 = (ne13 == 1) ? 0 : i03;
uint32_t i11 = (ne11 == 1) ? 0 : i01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
- uint8_t * src1_base = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
+ uint8_t * src1_curr = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * s1_spad = src1_spad_base + spad_idx * src1_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
- dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
- dma_queue_push(q, dma_make_ptr(s1_spad, src1_base), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, current_block_size);
+ dma_queue_push(q, dma_make_ptr(s1_spad, src1_curr), bctx->src1_row_size_aligned, nb11, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
}
for (uint32_t ir = start_row; ir < end_row; ) {
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
- uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint8_t * s1_spad = (uint8_t *) dma_queue_pop(q).dst;
}
uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
+ i03 = fastdiv(ir, &bctx->src0_dim12_div);
rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
+ i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
- p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
+ p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
- p02 = fastdiv(prem, &bctx->dim1_div);
+ p02 = fastdiv(prem, &bctx->src0_dim1_div);
p01 = prem - p02 * ne01;
uint32_t p13 = (ne13 == 1) ? 0 : p03;
uint8_t * s1_next = (uint8_t *)src1->data + p13 * nb13 + p12 * nb12 + p11 * nb11;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
- dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, next_block_size);
+ dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, nb11, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
}
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
- const uint32_t src0_type = octx->src0.type;
+ const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
- const uint32_t start_row = bctx->nrows_per_thread * ith;
- const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
+ FARF(HIGH, "binary-row-bcast: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
+
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
- uint8_t * src1_spad = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
+ uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
uint32_t ir_prefetch = start_row;
int spad_idx = 0;
- void * s1_ptr = (void *) src1_spad;
+ void * s1_ptr = (void *) src1_spad_base;
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- rem = ir_prefetch - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
- dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
for (uint32_t ir = start_row; ir < end_row; ) {
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
- uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
for (uint32_t r = 0; r < current_block_size; r++) {
COMPUTE_VECTOR_OP_AAA(r_dst, r_src0, r_src1, src0_type, ne00);
}
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
- rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
+ uint32_t rem = ir - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t p03, p02, p01, prem;
- p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- prem = ir_prefetch - p03 * (ne02 * ne01);
- p02 = fastdiv(prem, &bctx->dim1_div);
- p01 = prem - p02 * ne01;
+ uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
+ uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
+ uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
- const uint32_t start_row = bctx->nrows_per_thread * ith;
- const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
+ FARF(HIGH, "binary-complex: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
+
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
- size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
- size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
dma_queue * q = octx->ctx->dma[ith];
uint32_t ir_prefetch = start_row;
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- rem = ir_prefetch - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
- dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
- rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
+ uint32_t rem = ir - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
for (uint32_t r = 0; r < current_block_size; r++) {
uint32_t r_i01 = i01 + r;
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t p03, p02, p01, prem;
- p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- prem = ir_prefetch - p03 * (ne02 * ne01);
- p02 = fastdiv(prem, &bctx->dim1_div);
- p01 = prem - p02 * ne01;
+ uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
+ uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
+ uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
const uint32_t row_size_bytes = ne00 * elem_size_bytes;;
const uint32_t total_rows = ne01 * ne02 * ne03;
- const uint32_t start_row = bctx->nrows_per_thread * ith;
- const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
- size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
- size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
+
+ FARF(HIGH, "binary-repeat: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
dma_queue * q = octx->ctx->dma[ith];
uint32_t ir_prefetch = start_row;
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- rem = ir_prefetch - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
- dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
- rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
+ uint32_t rem = ir - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
for (uint32_t r = 0; r < current_block_size; r++) {
uint32_t r_i01 = i01 + r;
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t p03, p02, p01, prem;
- p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- prem = ir_prefetch - p03 * (ne02 * ne01);
- p02 = fastdiv(prem, &bctx->dim1_div);
- p01 = prem - p02 * ne01;
+ uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
+ uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
+ uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
const uint32_t nb02 = src0->nb[2];
const uint32_t nb03 = src0->nb[3];
const uint32_t nb11 = src1->nb[1]; // src1 row stride
+
const uint32_t nb1 = dst->nb[1];
const uint32_t nb2 = dst->nb[2];
const uint32_t nb3 = dst->nb[3];
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
- size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
- size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
dma_queue * q = octx->ctx->dma[ith];
uint32_t ir_prefetch = start_row;
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- rem = ir_prefetch - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
- dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, ne00 * sizeof(float), 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
- uint32_t i03, i02, i01, rem;
- i03 = fastdiv(ir, &bctx->dim12_div);
- rem = ir - i03 * (ne02 * ne01);
- i02 = fastdiv(rem, &bctx->dim1_div);
- i01 = rem - i02 * ne01;
+ uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
+ uint32_t rem = ir - i03 * (ne02 * ne01);
+ uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
+ uint32_t i01 = rem - i02 * ne01;
for (uint32_t r = 0; r < current_block_size; r++) {
uint32_t r_i01 = i01 + r; // linear within block since we split at ne01
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
- uint32_t p03, p02, p01, prem;
- p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
- prem = ir_prefetch - p03 * (ne02 * ne01);
- p02 = fastdiv(prem, &bctx->dim1_div);
- p01 = prem - p02 * ne01;
+ uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
+ uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
+ uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
+ uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), next_block_size);
ir_prefetch += next_block_size;
const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
const size_t src0_row_size = src0->ne[0] * elem_size;
const size_t src1_row_size = src1->ne[0] * elem_size;
- const size_t dst_row_size = dst->ne[0] * elem_size;
+ const size_t dst_row_size = dst->ne[0] * elem_size;
- // Align to VLEN
- const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
- const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
+ size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
+ size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
bool is_add_id = (octx->op == HTP_OP_ADD_ID);
bool is_scalar = !is_add_id && (src1->ne[0] == 1);
- // Determine which kernel we will use to alloc memory and dispatch
- bool use_vector_same = !is_add_id && !is_scalar && ((src0->nb[1] % VLEN) == 0) && (src1->ne[0] == src0->ne[0]) &&
+ bool is_transposed = (src0->nb[1] < src0_row_size || src1->nb[1] < src1_row_size || dst->nb[1] < dst_row_size);
+
+ bool is_same_shape = !is_add_id && !is_scalar && !is_transposed &&
+ (src1->ne[0] == src0->ne[0] && src0->ne[0] % VLEN == 0) &&
(src1->ne[1] == src0->ne[1] || src1->ne[1] == 1) &&
(src1->ne[2] == src0->ne[2] || src1->ne[2] == 1) &&
(src1->ne[3] == src0->ne[3] || src1->ne[3] == 1);
- bool is_row_bcast = use_vector_same && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
- bool use_complex = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] == src0->ne[0]);
- bool use_repeat = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] != src0->ne[0]);
+ bool is_row_bcast = is_same_shape && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
+ bool is_complex = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] == src0->ne[0]);
+ bool is_repeat = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] != src0->ne[0]);
size_t spad_row_total;
- if (is_scalar) {
- spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
- } else if (is_row_bcast) {
- spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
- } else if (use_vector_same) {
+ if (is_same_shape) {
spad_row_total = 2 * (src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned);
- } else if (is_add_id) {
- spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned); // src1 read directly
} else {
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
}
size_t rows_per_buffer = octx->ctx->vtcm_size / (n_threads * spad_row_total);
+
// Adjust for static src1 in row_bcast case
if (is_row_bcast) {
size_t needed_static = src1_row_size_aligned;
}
if (rows_per_buffer < 1) {
- FARF(ERROR, "binary: VTCM too small\n");
- return HTP_STATUS_VTCM_TOO_SMALL;
+ FARF(ERROR, "binary: VTCM too small\n");
+ return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.size_per_thread = rows_per_buffer * 2 * src0_row_size_aligned;
octx->dst_spad.size_per_thread = rows_per_buffer * 2 * dst_row_size_aligned;
- if (is_scalar || use_complex || use_repeat || is_add_id) {
- octx->src1_spad.size_per_thread = 0;
- } else if (is_row_bcast) {
+ if (is_add_id || is_scalar || is_complex || is_repeat || is_row_bcast) {
octx->src1_spad.size_per_thread = 0;
} else {
octx->src1_spad.size_per_thread = rows_per_buffer * 2 * src1_row_size_aligned;
}
+ octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
if (is_row_bcast) {
octx->src1_spad.size = src1_row_size_aligned;
} else {
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
}
- octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
if (octx->ctx->vtcm_size < (octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size)) {
return HTP_STATUS_VTCM_TOO_SMALL;
}
struct htp_binary_context bctx;
- bctx.octx = octx;
- bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
- bctx.block_max = rows_per_buffer;
+ bctx.octx = octx;
+ bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
+ bctx.block_max = rows_per_buffer;
bctx.src0_row_size_aligned = src0_row_size_aligned;
bctx.src1_row_size_aligned = src1_row_size_aligned;
bctx.dst_row_size_aligned = dst_row_size_aligned;
- bctx.dim1_div = init_fastdiv_values(src0->ne[1]);
- bctx.dim2_div = init_fastdiv_values(src0->ne[2]);
- bctx.dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
+ bctx.src0_dim1_div = init_fastdiv_values(src0->ne[1]);
+ bctx.src0_dim2_div = init_fastdiv_values(src0->ne[2]);
+ bctx.src0_dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
- bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
- bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
- bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
+ bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
+ bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
+ bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
bool src0_contig_dim1 = (src0->nb[2] == src0->ne[1] * src0->nb[1]);
- bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
+ bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
bool src0_contig_dim2 = (src0->nb[3] == src0->ne[2] * src0->nb[2]);
- bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
-
- bctx.split_at_ne01 = (src0->ne[2] > 1) &&
- ((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
+ bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
- bctx.split_at_ne02 = (src0->ne[3] > 1) &&
- ((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
-
- // Precompute specific kernel parameters
- if (use_vector_same) {
- bctx.src1_dma_stride = (src1->ne[1] == 1) ? 0 : src1->nb[1];
- bctx.src1_fetch_rows = (src1->ne[1] == 1) ? 1 : rows_per_buffer;
- }
+ bctx.split_at_ne01 = (src0->ne[2] > 1) && ((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
+ bctx.split_at_ne02 = (src0->ne[3] > 1) && ((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
worker_callback_t worker_func;
- if (is_add_id) worker_func = binary_job_add_id;
- else if (is_scalar) worker_func = binary_job_scalar;
- else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
- else if (use_vector_same) worker_func = binary_job_vector_same_shape;
- else if (use_complex) worker_func = binary_job_vector_complex;
- else worker_func = binary_job_element_repeat;
+ if (is_add_id) worker_func = binary_job_add_id;
+ else if (is_scalar) worker_func = binary_job_scalar;
+ else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
+ else if (is_same_shape) worker_func = binary_job_vector_same_shape;
+ else if (is_complex) worker_func = binary_job_vector_complex;
+ else worker_func = binary_job_element_repeat;
if (is_row_bcast) {
dma_queue_pop(q);
extern "C" {
#endif
+// Define the HW descriptor structs here since the ones in HexSDK are a bit out of date
+// Note: 1D and 2D descriptors share the same leading layout (next pointer,
+// control word, src, dst), which is what allows the queue code to cast between them.
+
+// Linear (1D) descriptor: transfers `size` contiguous bytes from src to dst.
+typedef struct dma_descriptor_1d_s {
+    void * next;
+    uint32_t size:24;
+    uint32_t desc_size:2;
+    uint32_t dst_comp:1;
+    uint32_t src_comp:1;
+    uint32_t dst_bypass:1;
+    uint32_t src_bypass:1;
+    uint32_t order:1;
+    uint32_t done:1;
+    void * src;
+    void * dst;
+} dma_descriptor_1d;
+
+#if __HVX_ARCH__ < 75
+
+// Pre-v75 2D descriptor layout: 16-bit row size, row count and strides,
+// with separate src/dst offset fields.
+typedef struct dma_descriptor_2d_s {
+    void * next;
+    uint32_t reserved0:24;
+    uint32_t desc_size:2;
+    uint32_t dst_comp:1;
+    uint32_t src_comp:1;
+    uint32_t dst_bypass:1;
+    uint32_t src_bypass:1;
+    uint32_t order:1;
+    uint32_t done:1;
+    void * src;
+    void * dst;
+    uint32_t desc_type:8;
+    uint32_t reserved1:24;
+    uint32_t row_size:16;
+    uint32_t nrows:16;
+    uint32_t src_stride:16;
+    uint32_t dst_stride:16;
+    uint32_t src_offset:16;
+    uint32_t dst_offset:16;
+} dma_descriptor_2d;
+
+#else
+
+// v75+ 2D descriptor layout: widened 24-bit strides and row size; the row
+// count is split across nrows_lo/nrows_hi and src/dst share a single offset field.
+typedef struct dma_descriptor_2d_s {
+    void * next;
+    uint32_t dst_stride:24;
+    uint32_t desc_size:2;
+    uint32_t dst_comp:1;
+    uint32_t src_comp:1;
+    uint32_t dst_bypass:1;
+    uint32_t src_bypass:1;
+    uint32_t order:1;
+    uint32_t done:1;
+    void * src;
+    void * dst;
+    uint32_t desc_type:8;
+    uint32_t reserved0:24;
+    uint32_t row_size:24;
+    uint32_t nrows_lo:8;
+    uint32_t nrows_hi:8;
+    uint32_t src_stride:24;
+    uint32_t offset:24;
+    uint32_t reserved1:8;
+} dma_descriptor_2d;
+
+#endif
+
+
+// dst/src pointer pair recorded for each queued DMA transfer so callers can
+// retrieve the endpoints again from dma_queue_pop().
typedef struct {
-    void *dst;
+    void *dst;
    const void *src;
} dma_ptr;
typedef struct {
- hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
- hexagon_udma_descriptor_type1_t * tail; // tail pointer
- dma_ptr * dptr; // dst/src pointers
- uint32_t push_idx;
- uint32_t pop_idx;
- uint32_t capacity;
- uint32_t idx_mask;
+ dma_descriptor_2d * desc; // descriptor pointers
+ dma_descriptor_2d * tail; // tail pointer
+ dma_ptr * dptr; // dst/src pointers
+ uint32_t push_idx;
+ uint32_t pop_idx;
+ uint32_t capacity;
+ uint32_t idx_mask;
} dma_queue;
dma_queue * dma_queue_create(size_t capacity);
return p;
}
-static inline bool dma_queue_push(dma_queue * q,
- dma_ptr dptr,
- size_t dst_row_size,
- size_t src_row_size,
- size_t width, // width in bytes. number of bytes to transfer per row
- size_t nrows) {
+#if __HVX_ARCH__ < 73
+static const uint32_t dma_src_l2_bypass_on = 1;
+static const uint32_t dma_dst_l2_bypass_on = 0;
+#else
+static const uint32_t dma_src_l2_bypass_on = 1;
+static const uint32_t dma_dst_l2_bypass_on = 1;
+#endif
+
+static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
- FARF(ERROR, "dma-push: queue full\n");
+ FARF(HIGH, "dma-push: queue full\n");
return false;
}
- hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx];
+ dma_descriptor_1d * desc = (dma_descriptor_1d *) &q->desc[q->push_idx];
+ desc->next = NULL;
+ desc->desc_size = 0; // 1D mode
+ desc->src_bypass = dma_src_l2_bypass_on;
+ desc->dst_bypass = dma_dst_l2_bypass_on;
+ desc->order = 1;
+ desc->done = 0;
+ desc->src = (void *) dptr.src;
+ desc->dst = (void *) dptr.dst;
+ desc->size = size;
+
+ q->dptr[q->push_idx] = dptr;
+
+ dmlink(q->tail, desc);
+ q->tail = (dma_descriptor_2d *) desc;
+
+    // FARF(ERROR, "dma-push: i %u size %u dst %p src %p\n", q->push_idx, (unsigned) size, dptr.dst, dptr.src);
+ q->push_idx = (q->push_idx + 1) & q->idx_mask;
+ return true;
+}
+
+static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
+ if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
+ FARF(HIGH, "dma-push: queue full\n");
+ return false;
+ }
+
+ dma_descriptor_2d * desc = &q->desc[q->push_idx];
desc->next = NULL;
- desc->length = 0;
- desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
- desc->dstbypass = 1;
- desc->srcbypass = 1;
-#if __HVX_ARCH__ >= 73
- desc->dstbypass = 1;
- desc->srcbypass = 1;
-#else
- desc->dstbypass = 0;
- desc->srcbypass = 1;
-#endif
- desc->order = 0;
- desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
+ desc->reserved0 = 0;
+ desc->reserved1 = 0;
+ desc->desc_size = 1; // 2d mode
+ desc->src_bypass = dma_src_l2_bypass_on;
+ desc->dst_bypass = dma_dst_l2_bypass_on;
+ desc->src_comp = 0;
+ desc->dst_comp = 0;
+ desc->order = 1;
+ desc->done = 0;
+ desc->src_stride = src_stride;
+ desc->dst_stride = dst_stride;
desc->src = (void *) dptr.src;
desc->dst = (void *) dptr.dst;
- desc->allocation = 0;
- desc->padding = 0;
- desc->roiwidth = width;
- desc->roiheight = nrows;
- desc->srcstride = src_row_size;
- desc->dststride = dst_row_size;
- desc->srcwidthoffset = 0;
- desc->dstwidthoffset = 0;
+ desc->row_size = row_size;
+
+#if __HVX_ARCH__ < 75
+ desc->desc_type = 0; // 2d (16-bit) mode
+ desc->nrows = nrows;
+ desc->src_offset = 0;
+ desc->dst_offset = 0;
+#else
+ desc->desc_type = 9; // 2d (24-bit) mode
+ desc->nrows_lo = (nrows & 0xff);
+ desc->nrows_hi = (nrows >> 8);
+ desc->offset = 0;
+#endif
q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc);
q->tail = desc;
- // FARF(ERROR, "dma-push: i %u width %u nrows %d dst %p src %p\n", q->push_idx, width, nrows, dptr.dst, dptr.src);
+ // FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
q->push_idx = (q->push_idx + 1) & q->idx_mask;
return true;
}
-static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
- dma_ptr dptr,
- size_t dst_row_size,
- size_t src_row_size,
- size_t nrows) {
- return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
-}
-
-
-static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
- dma_ptr dptr,
- size_t dst_row_size,
- size_t src_row_size,
- size_t nrows) {
- return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
-}
-
static inline dma_ptr dma_queue_pop(dma_queue * q) {
dma_ptr dptr = { NULL };
return dptr;
}
- hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
+ dma_descriptor_2d * desc = &q->desc[q->pop_idx];
// Wait for desc to complete
while (1) {
dmpoll();
- if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) {
+ if (desc->done) {
break;
}
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
return q->capacity;
}
-// ---------------------------------------------------------------------------
-// Overflow-safe DMA push: all UDMA type1 descriptor fields (roiwidth,
-// roiheight, srcstride, dststride) are 16-bit, max 65535. This helper
-// transparently handles values that exceed the 16-bit limit and submits
-// chained DMA transtions.
-//
-// Case 1 (fast path): all params fit in 16 bits -> direct dma_queue_push.
-// Case 2 (contiguous block): width == srcstride == dststride. Reshape the
-// flat transfer into a 2D descriptor with sub_width <= 65535. Produces a
-// single descriptor, preserving async DMA behavior.
-// Case 3 (stride overflow): srcstride or dststride > 65535. Issue rows
-// one at a time. The first N-1 rows are pushed+popped synchronously;
-// the last row is left async so the caller can pop it.
-// ---------------------------------------------------------------------------
-#define UDMA_MAX_FIELD_VAL 65535u
-
-static inline bool dma_queue_push_chained(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t width, size_t nrows) {
- // Fast path: everything fits in 16 bits.
- if (__builtin_expect(
- width <= UDMA_MAX_FIELD_VAL &&
- nrows <= UDMA_MAX_FIELD_VAL &&
- src_stride <= UDMA_MAX_FIELD_VAL &&
- dst_stride <= UDMA_MAX_FIELD_VAL, 1)) {
- return dma_queue_push(q, dptr, dst_stride, src_stride, width, nrows);
- }
+#if __HVX_ARCH__ < 75
- // Case 2: contiguous block (width == src_stride == dst_stride).
- // Reshape total bytes into sub_width * sub_nrows where sub_width <= 65535.
- if (width == src_stride && width == dst_stride) {
- size_t total = width * nrows;
+// Overflow-safe DMA push: all 2d descriptor fields (row_size, nrows, src_stride, dst_stride) are 16-bit, max 65535.
+// This version transparently handles values that exceed the 16-bit limit and submits chained DMA transactions.
- // Pick the largest 128-byte-aligned sub_width that divides total evenly.
- size_t sub_width = UDMA_MAX_FIELD_VAL & ~(size_t)127; // 65408
- while (sub_width > 0 && total % sub_width != 0) {
- sub_width -= 128;
- }
- if (sub_width == 0) {
- // Fallback: use original width (must fit) with adjusted nrows.
- // This shouldn't happen for 128-aligned DMA sizes.
- sub_width = width;
- }
- size_t sub_nrows = total / sub_width;
-
- // Handle sub_nrows > 65535 by issuing chunked descriptors.
- const uint8_t *src = (const uint8_t *)dptr.src;
- uint8_t *dst = (uint8_t *)dptr.dst;
- size_t rows_done = 0;
- while (rows_done < sub_nrows) {
- size_t chunk = sub_nrows - rows_done;
- if (chunk > UDMA_MAX_FIELD_VAL) chunk = UDMA_MAX_FIELD_VAL;
-
- dma_ptr p = dma_make_ptr(dst + rows_done * sub_width, src + rows_done * sub_width);
- if (!dma_queue_push(q, p, sub_width, sub_width, sub_width, chunk))
- return false;
+#define DMA_MAX_FIELD_VAL 65535u
- rows_done += chunk;
- // Complete all chunks without waiting except the last one, so the
- // caller's single dma_queue_pop drains the final descriptor.
- if (rows_done < sub_nrows)
- dma_queue_pop_nowait(q);
- }
- return true;
+static inline bool dma_queue_push(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
+ // Fast path: everything fits in 16 bits
+ if (nrows == 0 || __builtin_expect(
+ row_size <= DMA_MAX_FIELD_VAL &&
+ nrows <= DMA_MAX_FIELD_VAL &&
+ src_stride <= DMA_MAX_FIELD_VAL &&
+ dst_stride <= DMA_MAX_FIELD_VAL, 1)) {
+ return dma_queue_push_single_2d(q, dptr, dst_stride, src_stride, row_size, nrows);
}
- // Case 3: stride overflow — fall back to row-by-row.
+ // Contiguous block
+ // Use 1d DMA mode which supports sizes up to 24-bits (16MB)
+ if (nrows == 1 || (row_size == src_stride && row_size == dst_stride)) {
+ size_t total = row_size * nrows;
+ return dma_queue_push_single_1d(q, dptr, total);
+ }
+
+ // Stride overflow — fall back to row-by-row.
{
- const uint8_t *src = (const uint8_t *)dptr.src;
- uint8_t *dst = (uint8_t *)dptr.dst;
+ const uint8_t *src = (const uint8_t *) dptr.src;
+ uint8_t *dst = (uint8_t *) dptr.dst;
for (size_t r = 0; r < nrows; ++r) {
- dma_ptr p = dma_make_ptr(dst + r * dst_stride,
- src + r * src_stride);
- if (!dma_queue_push(q, p, 0, 0, width, 1))
- return false;
- if (r + 1 < nrows)
- dma_queue_pop_nowait(q);
+ dma_ptr p = dma_make_ptr(dst + r * dst_stride, src + r * src_stride);
+ if (!dma_queue_push_single_1d(q, p, row_size))
+ return false;
+ if (r + 1 < nrows)
+ dma_queue_pop(q);
}
return true;
}
}
+#else // HVX_ARCH >= 75
+
+static inline bool dma_queue_push(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
+ // On v75 and up we always use 2d 24-bit mode
+ return dma_queue_push_single_2d(q, dptr, dst_stride, src_stride, row_size, nrows);
+}
+
+#endif
+
+static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, dma_ptr dptr, size_t dst_row_size, size_t src_row_size, size_t nrows) {
+ return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
+}
+
+static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, dma_ptr dptr, size_t dst_row_size, size_t src_row_size, size_t nrows) {
+ return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
+}
+
#ifdef __cplusplus
} // extern "C"
#endif