req.op = HTP_OP_UNARY_SILU;
supported = true;
}
- else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){
+ else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) {
req.op = HTP_OP_UNARY_GELU;
supported = true;
}
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
- uint32_t src0_nrows_per_thread) {
+ uint32_t src0_nrows_per_thread,
+ dma_queue * dma_queue) {
htp_act_preamble2;
uint64_t t1, t2;
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
+ const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+ const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
const uint32_t src0_nrows = ne01 * ne02 * ne03;
return;
}
- int is_aligned = 1;
- int opt_path = 0;
- if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
- is_aligned = 0;
- FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
- }
- if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
- opt_path = 1;
+ const uint8_t * data_src0 = (const uint8_t *) src0->data;
+ uint8_t * data_dst = (uint8_t *) dst->data;
+
+ uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
+ uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
+
+ // Split the per-thread scratchpad into two ping-pong halves (for both src0 and dst)
+ size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
+ size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
+
+ // gelu is computed as x * sigmoid(1.702 * x)
+ const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
+
+ if (BLOCK == 0) {
+ FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
+ src0_spad->size_per_thread, src0_row_size_aligned);
+ return;
}
- const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
- uint8_t * restrict data_dst = (uint8_t *) dst->data;
+ // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
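+ // Prime the DMA queue for both ping-pong halves: a dummy VTCM->DDR entry followed by the
+ // DDR->VTCM load of the first (up to) two blocks, so the main loop can always pop a dst,src pair per block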
+ for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
+ const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
- uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
- uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
+ // Dummy VTCM->DDR transfer (0 rows) used only for sequencing, so pops interleave as dst,src,dst,...
+ dma_queue_push_vtcm_to_ddr(dma_queue,
+ dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
+ dst_row_size, dst_row_size_aligned, 0);
+
+ dma_queue_push_ddr_to_vtcm(dma_queue,
+ dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
+ src0_row_size_aligned, src0_row_size, block_size);
+ }
- const int BLOCK = 8;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
- const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
+ const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
- // Prefetch next block
- if (block_end < src0_end_row) {
- const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
- htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
- }
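+ // Pop this block's pair: the VTCM->DDR entry (dummy on the first pass, otherwise the store issued two
+ // iterations back) yields the dst half via .src; the completed DDR->VTCM load yields the src0 half via .dst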
+ float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
+ float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
- // Process rows in current block
- for (uint32_t ib = ir; ib < block_end; ib++) {
- const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
- float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
+ for (uint32_t ib = 0; ib < block_size; ib++) {
+ const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
+ float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
// gelu = x * sigmoid(1.702 * x) // current implementation
- if (1 == opt_path) {
- hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
- hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
- hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
- } else {
- hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
- hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
- hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
- }
+ hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
+ hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+ hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+ }
+
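+ // Queue the store of this block's results back to DDR; completion is checked when this dst half is popped again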
+ dma_queue_push_vtcm_to_ddr(dma_queue,
+ dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
+ dst_row_size, dst_row_size_aligned, block_size);
+
+ // Prefetch the block for loop iteration N+2, if any
+ const uint32_t pref_block = (ir + BLOCK * 2);
+ if (pref_block < src0_end_row) {
+ const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
+ dma_queue_push_ddr_to_vtcm(dma_queue,
+ dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
+ src0_row_size_aligned, src0_row_size, pref_block_size);
}
}
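+ // Wait for the remaining in-flight stores to complete before returning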
+ dma_queue_flush(dma_queue);
+
t2 = HAP_perf_get_qtimer_count();
- FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
+ FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
- octx->src0_nrows_per_thread);
+ octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
const uint32_t n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
- const size_t src0_row_size = src0->nb[1];
- const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
- const size_t dst_row_size = dst->nb[1];
+ size_t src0_row_size = src0->nb[1];
+ size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
+ size_t dst_row_size = dst->nb[1];
+
+ const bool src1_valid = src1->ne[0];
+ if (!src1_valid) {
+ src1_row_size = src0_row_size;
+ }
+ const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+ const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+ const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
- octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
- octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
- octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
- size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+ size_t spad_size_per_row = src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned;
+ size_t vtcm_row_per_thread = octx->ctx->vtcm_size / (n_threads * spad_size_per_row);
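+ // e.g. (hypothetical sizes): 4 threads, an 8 MB VTCM reservation, and f32 rows with ne00 = 1024
+ // give 4096 B rows (already VLEN-aligned), spad_size_per_row = 3 * 4096 = 12288 B,
+ // and vtcm_row_per_thread = 8388608 / (4 * 12288) = 170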
+
+ // Make sure the reserved vtcm size is sufficient
+ if (vtcm_row_per_thread == 0) {
+ FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
+ spad_size_per_row * n_threads);
+ return HTP_STATUS_VTCM_TOO_SMALL;
+ }
+
+ octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
+ octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
+ octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
+
+ octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
+ octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+ octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+
+ octx->src0_spad.data = octx->ctx->vtcm_base;
+ octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+ octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (src1->ne[0]) {
- FARF(HIGH,
- "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
+ FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}
- // Make sure the reserved vtcm size is sufficient
- if (octx->ctx->vtcm_size < spad_size) {
- FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
- spad_size);
- return HTP_STATUS_VTCM_TOO_SMALL;
- }
-
- octx->src0_spad.data = octx->ctx->vtcm_base;
- octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
- octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
-
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
-
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
}
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
- q->dst = (void **) memalign(4, capacity * sizeof(void *));
- memset(q->dst, 0, capacity * sizeof(void *));
+ q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
+ memset(q->dptr, 0, capacity * sizeof(dma_ptr));
q->tail = &q->desc[capacity - 1];
- if (!q->desc && !q->dst) {
+ if (!q->desc || !q->dptr) {
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
return NULL;
}
return;
}
free(q->desc);
- free(q->dst);
+ free(q->dptr);
free(q);
}
void dma_queue_flush(dma_queue * q) {
- while (1) {
- uint32_t s = dmwait() & 0x3;
- if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
- break;
- }
- }
- q->tail = NULL;
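+ // Drain the queue: pop (and wait on) every outstanding descriptor; pop returns a NULL dst once empty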
+ while (dma_queue_pop(q).dst != NULL) ;
}
extern "C" {
#endif
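+// Destination/source pointer pair for one queued transfer; returned by dma_queue_pop()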
+typedef struct {
+ void * dst;
+ const void * src;
+} dma_ptr;
+
typedef struct {
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
hexagon_udma_descriptor_type1_t * tail; // tail pointer
- void ** dst; // dst pointers
+ dma_ptr * dptr; // dst/src pointers
uint32_t push_idx;
uint32_t pop_idx;
uint32_t capacity;
return ret;
}
-static inline bool dma_queue_push(dma_queue * q,
- void * dst,
- const void * src,
- size_t dst_row_size,
- size_t src_row_size,
- size_t nrows) {
+static inline dma_ptr dma_make_ptr(void * dst, const void * src) {
+ dma_ptr p = { dst, src };
+ return p;
+}
+
+static inline bool dma_queue_push(dma_queue * q,
+ dma_ptr dptr,
+ size_t dst_row_size,
+ size_t src_row_size,
+ size_t width, // number of bytes to transfer per row (DMA ROI width)
+ size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
+ FARF(ERROR, "dma-push: queue full\n");
return false;
}
#endif
desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
- desc->src = (void *) src;
- desc->dst = (void *) dst;
+ desc->src = (void *) dptr.src;
+ desc->dst = (void *) dptr.dst;
desc->allocation = 0;
desc->padding = 0;
- desc->roiwidth = src_row_size;
+ desc->roiwidth = width;
desc->roiheight = nrows;
desc->srcstride = src_row_size;
desc->dststride = dst_row_size;
desc->srcwidthoffset = 0;
desc->dstwidthoffset = 0;
- q->dst[q->push_idx] = dst;
+ q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc);
q->tail = desc;
return true;
}
-static inline uint8_t * dma_queue_pop(dma_queue * q) {
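+// DDR -> VTCM load: ROI width is the (unpadded) source row size; rows are placed in VTCM at the dst_row_size stride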
+static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
+ dma_ptr dptr,
+ size_t dst_row_size,
+ size_t src_row_size,
+ size_t nrows) {
+ return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
+}
+
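+// VTCM -> DDR store: ROI width is the destination row size, so per-row alignment padding in VTCM is not written back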
+static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
+ dma_ptr dptr,
+ size_t dst_row_size,
+ size_t src_row_size,
+ size_t nrows) {
+ return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
+}
+
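+// Wait for the oldest queued transfer to complete and return its pointer pair; an empty queue returns a zeroed dma_ptr (NULL dst)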
+static inline dma_ptr dma_queue_pop(dma_queue * q) {
+ dma_ptr dptr = { NULL, NULL };
+
if (q->push_idx == q->pop_idx) {
- return NULL;
+ return dptr;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
}
- uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];
+ dptr = q->dptr[q->pop_idx];
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
- return dst;
+ return dptr;
}
#ifdef __cplusplus
int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32;
- assert(remaining == 0);
-
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
}
-}
+ if (remaining > 0) {
+ // Handle the tail (< VLEN_FP32 elements): unaligned vector load, then a partial store of the remaining bytes
+ const float * srcf = (const float *) src + step_of_1 * VLEN_FP32;
+ float * dstf = (float *) dst + step_of_1 * VLEN_FP32;
+
+ HVX_Vector in = *(HVX_UVector *) srcf;
+ HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
+ hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
+ }
+}
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) {
- ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
+ // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
+ ctx->dma[i] = dma_queue_create(64);
}
// init worker pool
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const int is0 = (ir0 - src0_start_row);
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row
const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
if (src0_end_row != src0_end_row_x2) {
const uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
}
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
- dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
- const uint8_t * ss0 = dma_queue_pop(dma_queue);
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
}
}