git.djapps.eu Git - pkg/ggml/sources/ggml/commitdiff
ggml-hexagon: gelu optimization (llama/18151)
author    Shouyu <redacted>
          Mon, 22 Dec 2025 18:56:52 +0000 (13:56 -0500)
committer Georgi Gerganov <redacted>
          Wed, 31 Dec 2025 10:39:43 +0000 (12:39 +0200)
* feat: working gelu with src0 put on vtcm

* feat: gelu ping-pong for both in and out

* fix: fix compile error

* break: distinguish dma ddr->vtcm and vtcm->ddr operations

* fix: fix dma queue size

* break: update dma api to either pop src or dst ptr

* fix: fix activation vtcm allocation issue for src1 when swapped

* refactor: ping-pong gelu logic to avoid unnecessary if/else

* dma: improved queue interface and prefetch handling

* gelu: fix N+2 block prefetch

---------

Co-authored-by: Max Krasnyansky <redacted>
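
The central idea of the commit is double buffering: while one half of each thread's VTCM scratchpad is being computed on, the DMA engine fills the other half with the next block of rows, hiding DDR latency behind the HVX work. A minimal sketch of the pattern, using hypothetical helpers (dma_start, dma_wait, dma_store, compute_gelu, vtcm_half) rather than the actual ggml-hexagon API:

    // All names below are illustrative stand-ins, not the ggml-hexagon API.
    extern void    dma_start(int half, int block);  // kick off async DDR->VTCM fill
    extern void    dma_wait(int half);              // block until that fill lands
    extern void    dma_store(int half, int block);  // async VTCM->DDR writeback
    extern void    dma_flush(void);                 // drain outstanding transfers
    extern void    compute_gelu(float * rows, int nrows);
    extern float * vtcm_half[2];

    void gelu_pingpong(int n_blocks, int rows_per_block) {
        dma_start(0, 0);                            // prime the pipeline
        for (int blk = 0, half = 0; blk < n_blocks; blk++, half ^= 1) {
            dma_wait(half);                         // this half is now filled
            if (blk + 1 < n_blocks) {
                dma_start(half ^ 1, blk + 1);       // prefetch into the other half
            }
            compute_gelu(vtcm_half[half], rows_per_block);
            dma_store(half, blk);                   // write results back to DDR
        }
        dma_flush();
    }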
src/ggml-hexagon/ggml-hexagon.cpp
src/ggml-hexagon/htp/act-ops.c
src/ggml-hexagon/htp/htp-dma.c
src/ggml-hexagon/htp/htp-dma.h
src/ggml-hexagon/htp/hvx-utils.h
src/ggml-hexagon/htp/main.c
src/ggml-hexagon/htp/matmul-ops.c

index 6a00abacc3913ad53f4370f4e48c15a87ebb48fb..853a5bda1e6a9038f7708a6624749ce81511ad9d 100644 (file)
@@ -2668,7 +2668,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
                 req.op    = HTP_OP_UNARY_SILU;
                 supported = true;
             }
-            else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){
+            else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) {
                 req.op    = HTP_OP_UNARY_GELU;
                 supported = true;
             }
index 586b5c1f9229e4549fa02297c62b158a071ee234..7e488456ee6c37e83913678f9d3a36abfc467531 100644 (file)
@@ -263,7 +263,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
                                        struct htp_spad *         dst_spad,
                                        uint32_t                  nth,
                                        uint32_t                  ith,
-                                       uint32_t                  src0_nrows_per_thread) {
+                                       uint32_t                  src0_nrows_per_thread,
+                                       dma_queue *               dma_queue) {
     htp_act_preamble2;
 
     uint64_t t1, t2;
@@ -271,6 +272,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
     const size_t src0_row_size = nb01;
     const size_t dst_row_size  = nb1;
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
 
     const uint32_t src0_nrows = ne01 * ne02 * ne03;
 
@@ -282,60 +285,81 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
         return;
     }
 
-    int is_aligned = 1;
-    int opt_path   = 0;
-    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
-        is_aligned = 0;
-        FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
-    }
-    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
-        opt_path = 1;
+    const uint8_t * data_src0 = (const uint8_t *) src0->data;
+    uint8_t * data_dst        = (uint8_t *) dst->data;
+
+    uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
+    uint8_t * dst_spad_data  = dst_spad->data  + (ith * dst_spad->size_per_thread);
+
+    // Split src0_spad->size_per_thread into two ping-pong buffers for src0
+    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
+    size_t dst_spad_half_size  = dst_spad->size_per_thread  / 2;
+
+    // gelu(x) is approximated as x * sigmoid(1.702 * x)
+    const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
+
+    if (BLOCK == 0) {
+        FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
+                src0_spad->size_per_thread, src0_row_size_aligned);
+        return;
     }
 
-    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
-    uint8_t * restrict data_dst        = (uint8_t *) dst->data;
+    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
+    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
+        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
 
-    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
-    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
+        // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
+        dma_queue_push_vtcm_to_ddr(dma_queue,
+            dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
+            dst_row_size, dst_row_size_aligned, 0);
+
+        dma_queue_push_ddr_to_vtcm(dma_queue,
+            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
+            src0_row_size_aligned, src0_row_size, block_size);
+    }
 
-    const int BLOCK = 8;
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
-        const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
+        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
 
-        // Prefetch next block
-        if (block_end < src0_end_row) {
-            const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
-            htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
-        }
+        float* dst_spad  = (float *) dma_queue_pop(dma_queue).src;
+        float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
 
-        // Process rows in current block
-        for (uint32_t ib = ir; ib < block_end; ib++) {
-            const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
-            float * restrict dst        = (float *) (data_dst + (ib * dst_row_size));
+        for (uint32_t ib = 0; ib < block_size; ib++) {
+            const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
+            float* dst_spad_ptr        = dst_spad  + ib * (dst_row_size_aligned  / sizeof(float));
 
             // gelu = x * sigmoid(1.702 * x) // current implementation
-            if (1 == opt_path) {
-                hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
-                hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-                hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
-            } else {
-                hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
-                hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-                hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
-            }
+            hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+        }
+
+        dma_queue_push_vtcm_to_ddr(dma_queue,
+            dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
+            dst_row_size, dst_row_size_aligned, block_size);
+
+        // prefetch the block for loop iteration N+2, if any
+        const uint32_t pref_block = (ir + BLOCK * 2);
+        if (pref_block < src0_end_row) {
+            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
+            dma_queue_push_ddr_to_vtcm(dma_queue,
+                dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
+                src0_row_size_aligned, src0_row_size, pref_block_size);
         }
     }
 
+    dma_queue_flush(dma_queue);
+
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
+    FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
          ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
 static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
     struct htp_ops_context * octx = (struct htp_ops_context *) data;
     unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
-                               octx->src0_nrows_per_thread);
+                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }
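
Worth noting how the sequencing works: the priming loop pushes a zero-row vtcm->ddr descriptor (which presumably completes immediately) ahead of each ddr->vtcm fill, purely so that the queue strictly alternates dst,src,dst,src. Each main-loop iteration then pops a pair: the first pop returns the store descriptor, yielding (via .src) the dst scratchpad half whose previous writeback has finished, and the second returns the fill descriptor, yielding (via .dst) the src half whose fill has finished. An illustrative trace for the two halves:

    // prime:  push store(d0, 0 rows), push fetch(s0, block 0)   [half 0]
    //         push store(d1, 0 rows), push fetch(s1, block 1)   [half 1]
    // iter N: pop -> d(N%2)   (previous store of this half has completed)
    //         pop -> s(N%2)   (fill of this half has completed)
    //         compute gelu on s(N%2) into d(N%2)
    //         push store(d(N%2), BLOCK rows)                    [real writeback]
    //         push fetch(s(N%2), block N+2)                     [keeps the queue alternating]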
 
 
@@ -468,21 +492,45 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
     const uint32_t n_threads  = octx->n_threads;
     const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
 
-    const size_t src0_row_size = src0->nb[1];
-    const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
-    const size_t dst_row_size  = dst->nb[1];
+    size_t src0_row_size = src0->nb[1];
+    size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
+    size_t dst_row_size  = dst->nb[1];
+
+    const bool src1_valid = src1->ne[0];
+    if (!src1_valid) {
+        src1_row_size = src0_row_size;
+    }
 
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
     // VTCM scratchpads for all tensors
     // N rows per thread, padded to HVX vector size
-    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * octx->n_threads;
-    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
-    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
 
-    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+    size_t spad_size_per_row   = src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned;
+    size_t vtcm_row_per_thread = octx->ctx->vtcm_size / (n_threads * spad_size_per_row);
+
+    // Make sure the reserved vtcm size is sufficient
+    if (vtcm_row_per_thread == 0) {
+        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
+             spad_size_per_row * n_threads);
+        return HTP_STATUS_VTCM_TOO_SMALL;
+    }
+
+    octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
+    octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
+    octx->dst_spad.size_per_thread  = dst_row_size_aligned * vtcm_row_per_thread;
+
+    octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
+    octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+    octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+
+    octx->src0_spad.data = octx->ctx->vtcm_base;
+    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
 
     if (src1->ne[0]) {
-        FARF(HIGH,
-             "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
+        FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
              op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
              src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
              octx->dst_spad.size);
@@ -492,20 +540,8 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
              octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
     }
 
-    // Make sure the reserved vtcm size is sufficient
-    if (octx->ctx->vtcm_size < spad_size) {
-        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
-             spad_size);
-        return HTP_STATUS_VTCM_TOO_SMALL;
-    }
-
-    octx->src0_spad.data = octx->ctx->vtcm_base;
-    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
-    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
-
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
         uint32_t n_jobs = MIN(n_threads, src0_nrows);
-
         octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
         worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
     }
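
The sizing math is easiest to see with concrete (hypothetical) numbers. Assume an 8 MiB VTCM reservation, 4 threads, f32 rows of ne0 = 4096 (16 KiB, already a multiple of VLEN), and no src1, so src1_row_size falls back to src0_row_size:

    // Worked example with assumed numbers only; not taken from the code.
    size_t row_size            = 16 * 1024;
    size_t spad_size_per_row   = 3 * row_size;                          // src0+src1+dst = 48 KiB
    size_t vtcm_row_per_thread = (8u << 20) / (4 * spad_size_per_row);  // = 42 rows
    // gelu then halves the per-thread src0 scratchpad into two ping-pong buffers:
    size_t BLOCK = (vtcm_row_per_thread * row_size / 2) / row_size;     // = 21 rows per half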
index 10c54b45ee239600eac82ae6694fb066c6303f9c..880c4542a0eb5af23a45d9d7ecce47bb08835c29 100644 (file)
@@ -34,12 +34,12 @@ dma_queue * dma_queue_create(size_t capacity) {
     q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
     memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
 
-    q->dst = (void **) memalign(4, capacity * sizeof(void *));
-    memset(q->dst, 0, capacity * sizeof(void *));
+    q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
+    memset(q->dptr, 0, capacity * sizeof(dma_ptr));
 
     q->tail = &q->desc[capacity - 1];
 
-    if (!q->desc && !q->dst) {
+    if (!q->desc && !q->dptr) {
         FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
         return NULL;
     }
@@ -54,16 +54,10 @@ void dma_queue_delete(dma_queue * q) {
         return;
     }
     free(q->desc);
-    free(q->dst);
+    free(q->dptr);
     free(q);
 }
 
 void dma_queue_flush(dma_queue * q) {
-    while (1) {
-        uint32_t s = dmwait() & 0x3;
-        if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
-            break;
-        }
-    }
-    q->tail = NULL;
+    while (dma_queue_pop(q).dst != NULL);
 }
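
The rewritten flush drains the queue by popping every outstanding descriptor, each pop spinning in dmwait() until its own descriptor completes, instead of waiting for the global DM0 idle state and discarding the tail pointer. This keeps push_idx and pop_idx consistent, so the queue remains usable after a flush. A sketch of the intended call pattern (buffer and size names are illustrative):

    // Push a fill, compute, then drain; the queue can be reused afterwards.
    dma_queue_push_ddr_to_vtcm(q, dma_make_ptr(vtcm_buf, ddr_buf),
                               row_size_aligned, row_size, nrows);
    float * rows = (float *) dma_queue_pop(q).dst;  // blocks until the copy lands
    /* ... HVX work on rows ... */
    dma_queue_flush(q);                             // pops any remaining descriptors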
index 7d3fc4078cc0729f37c7f9e34c265ad3a94eca7c..32fd06e7d4677b30359b507443c881e8fc602f35 100644 (file)
 extern "C" {
 #endif
 
+typedef struct {
+    void *       dst;
+    const void * src;
+} dma_ptr;
+
 typedef struct {
     hexagon_udma_descriptor_type1_t * desc;  // descriptor pointers
     hexagon_udma_descriptor_type1_t * tail;  // tail pointer
-    void **                           dst;   // dst pointers
+    dma_ptr                         * dptr;  // dst/src pointers
     uint32_t                          push_idx;
     uint32_t                          pop_idx;
     uint32_t                          capacity;
@@ -49,13 +54,20 @@ static inline unsigned int dmwait(void) {
     return ret;
 }
 
-static inline bool dma_queue_push(dma_queue *  q,
-                                  void *       dst,
-                                  const void * src,
-                                  size_t       dst_row_size,
-                                  size_t       src_row_size,
-                                  size_t       nrows) {
+static inline dma_ptr dma_make_ptr(void * dst, const void * src) {
+    dma_ptr p = { dst, src };
+    return p;
+}
+
+static inline bool dma_queue_push(dma_queue * q,
+                                  dma_ptr     dptr,
+                                  size_t      dst_row_size,
+                                  size_t      src_row_size,
+                                  size_t      width, // bytes to transfer per row
+                                  size_t      nrows) {
     if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
+        FARF(ERROR, "dma-push: queue full\n");
         return false;
     }
 
@@ -75,18 +87,18 @@ static inline bool dma_queue_push(dma_queue *  q,
 #endif
     desc->order          = 0;
     desc->dstate         = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
-    desc->src            = (void *) src;
-    desc->dst            = (void *) dst;
+    desc->src            = (void *) dptr.src;
+    desc->dst            = (void *) dptr.dst;
     desc->allocation     = 0;
     desc->padding        = 0;
-    desc->roiwidth       = src_row_size;
+    desc->roiwidth       = width;
     desc->roiheight      = nrows;
     desc->srcstride      = src_row_size;
     desc->dststride      = dst_row_size;
     desc->srcwidthoffset = 0;
     desc->dstwidthoffset = 0;
 
-    q->dst[q->push_idx] = dst;
+    q->dptr[q->push_idx] = dptr;
 
     dmlink(q->tail, desc);
     q->tail = desc;
@@ -96,9 +108,28 @@ static inline bool dma_queue_push(dma_queue *  q,
     return true;
 }
 
-static inline uint8_t * dma_queue_pop(dma_queue * q) {
+static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
+                                              dma_ptr     dptr,
+                                              size_t      dst_row_size,
+                                              size_t      src_row_size,
+                                              size_t      nrows) {
+    return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
+}
+
+
+static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
+                                              dma_ptr     dptr,
+                                              size_t      dst_row_size,
+                                              size_t      src_row_size,
+                                              size_t      nrows) {
+    return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
+}
+
+static inline dma_ptr dma_queue_pop(dma_queue * q) {
+    dma_ptr dptr  = { NULL };
+
     if (q->push_idx == q->pop_idx) {
-        return NULL;
+        return dptr;
     }
 
     hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
@@ -112,11 +143,11 @@ static inline uint8_t * dma_queue_pop(dma_queue * q) {
         // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
     }
 
-    uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];
+    dptr = q->dptr[q->pop_idx];
 
     // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
     q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
-    return dst;
+    return dptr;
 }
 
 #ifdef __cplusplus
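
The two direction-specific wrappers differ only in which stride is passed down as the ROI width: a ddr->vtcm fill copies src_row_size bytes per row into an aligned (possibly wider) VTCM stride, while a vtcm->ddr store copies dst_row_size bytes per row out of the aligned stride. A hedged usage sketch, with illustrative buffer names:

    // Fill VTCM from DDR: roiwidth = src_row_size (the unpadded DDR row).
    dma_queue_push_ddr_to_vtcm(q, dma_make_ptr(vtcm_dst, ddr_src),
                               row_size_aligned /* dststride */,
                               row_size         /* srcstride */, nrows);
    // Store back to DDR: roiwidth = dst_row_size (the unpadded DDR row).
    dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(ddr_dst, vtcm_src),
                               row_size         /* dststride */,
                               row_size_aligned /* srcstride */, nrows);
    dma_ptr done = dma_queue_pop(q);  // waits; .dst/.src identify the transfer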
index 566048297d3f6925d8559110ac87978f540dd650..d2d5d23636689f58949692c26d0a3a2ba34912a7 100644 (file)
@@ -980,8 +980,6 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     int step_of_1 = num_elems >> 5;
     int remaining = num_elems - step_of_1 * VLEN_FP32;
 
-    assert(remaining == 0);
-
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
 
@@ -996,8 +994,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     for (int i = 0; i < step_of_1; i++) {
         v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
     }
-}
 
+    if (remaining > 0) {
+        const float * srcf = ((const float *) src) + step_of_1 * VLEN_FP32;
+        float *       dstf = ((float *) dst) + step_of_1 * VLEN_FP32;
+
+        HVX_Vector in  = *(HVX_UVector *) srcf;
+        HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
+        hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
+    }
+}
 
 static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
     int step_of_1 = num_elems >> 5;  // divby 32, because 32 float = 128 bytes per HVX vector
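
Dropping the assert means hvx_fast_sigmoid_f32 now accepts element counts that are not multiples of 32 (VLEN_FP32): full vectors are processed as before, and the tail is handled with one unaligned vector load plus a partial store of remaining * SIZEOF_FP32 bytes. A scalar reference sketch of what that tail path computes (illustrative only; the HVX version applies the guarded fast-sigmoid approximation):

    #include <math.h>

    // Scalar reference for the new tail handling in hvx_fast_sigmoid_f32.
    static void sigmoid_ref_tail(const float * src, float * dst, int num_elems) {
        int full = num_elems >> 5;                   // 32 f32 lanes per 128-byte vector
        int rem  = num_elems - (full << 5);
        for (int i = full * 32; i < full * 32 + rem; i++) {
            dst[i] = 1.0f / (1.0f + expf(-src[i]));  // the sigmoid being approximated
        }
    }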
index 656c369d0aaf73759c0764d5d5b1d115ad4ba777..fb5508a560f30b9693d01f9ce37f16cda8aaf43e 100644 (file)
@@ -299,7 +299,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
 
     ctx->n_threads = n_hvx;
     for (int i = 0; i < ctx->n_threads; i++) {
-        ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
+        // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
+        ctx->dma[i] = dma_queue_create(64);
     }
 
     // init worker pool
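
The fixed capacity of 64 matters because the queue's index arithmetic wraps with idx_mask rather than a modulo, which requires a power-of-two capacity; it also decouples queue depth from the matmul scratchpad constant now that the activation ops share the DMA queue. A sketch of the ring checks, assuming idx_mask == capacity - 1:

    #include <stdbool.h>
    #include <stdint.h>

    // Ring-buffer index logic as used by dma_queue_push/pop, assuming
    // idx_mask == capacity - 1 (hence capacity must be a power of two).
    static inline bool ring_full(uint32_t push_idx, uint32_t pop_idx, uint32_t mask) {
        return ((push_idx + 1) & mask) == pop_idx;  // one slot kept free
    }
    static inline bool ring_empty(uint32_t push_idx, uint32_t pop_idx) {
        return push_idx == pop_idx;
    }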
index 0c9188244d0eeb82090d14cba2f259961e15a7be..f14523d485cf43ce82e2feaa63ca183a584cf65b 100644 (file)
@@ -1127,13 +1127,13 @@ static void matmul(struct htp_matmul_type * mt,
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
             break;
         }
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                        src0_row_size_padded, src0_row_size, 2);
     }
 
     // Process src0 rows
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
-        const uint8_t * ss0 = dma_queue_pop(dma_queue);
+        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
         #pragma unroll(2)
         for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@@ -1146,7 +1146,7 @@ static void matmul(struct htp_matmul_type * mt,
         const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
         const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
         if (pr0 < src0_end_row_x2) {
-            dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 2);
         }
     }
@@ -1155,9 +1155,9 @@ static void matmul(struct htp_matmul_type * mt,
     if (src0_end_row != src0_end_row_x2) {
         uint32_t  ir0 = src0_end_row_x2;
         const int is0 = (ir0 - src0_start_row);
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                        src0_row_size_padded, src0_row_size, 1);
-        const uint8_t * ss0 = dma_queue_pop(dma_queue);
+        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
         #pragma unroll(2)
         for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@@ -1229,20 +1229,20 @@ static void matvec(struct htp_matmul_type * mt,
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
             break;
         }
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                        src0_row_size_padded, src0_row_size, 2);
     }
 
     // Process src0 rows
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
-        const uint8_t * ss0 = dma_queue_pop(dma_queue);
+        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
         mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col);
 
         // Prefetch next (n + spad_nrows) row
         const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
         const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
         if (pr0 < src0_end_row_x2) {
-            dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 2);
         }
     }
@@ -1251,9 +1251,9 @@ static void matvec(struct htp_matmul_type * mt,
     if (src0_end_row != src0_end_row_x2) {
         const uint32_t ir0 = src0_end_row_x2;
         const uint32_t is0 = (ir0 - src0_start_row);
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                        src0_row_size_padded, src0_row_size, 1);
-        const uint8_t * ss0 = dma_queue_pop(dma_queue);
+        const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
         mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
     }
 
@@ -1343,13 +1343,13 @@ static void matmul_id(struct htp_matmul_type * mt,
             if (is0 >= HTP_SPAD_SRC0_NROWS) {
                 break;
             }
-            dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 2);
         }
 
         // Process src0 rows
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
-            const uint8_t * ss0 = dma_queue_pop(dma_queue);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
             for (uint32_t cid = 0; cid < cne1; ++cid) {
                 struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@@ -1368,7 +1368,7 @@ static void matmul_id(struct htp_matmul_type * mt,
             const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
             const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
             if (pr0 < src0_end_row_x2) {
-                dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+                dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
                                src0_row_size_padded, src0_row_size, 2);
             }
         }
@@ -1377,9 +1377,9 @@ static void matmul_id(struct htp_matmul_type * mt,
         if (src0_end_row != src0_end_row_x2) {
             uint32_t       ir0 = src0_end_row_x2;
             const uint32_t is0 = (ir0 - src0_start_row);
-            dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 1);
-            const uint8_t * ss0 = dma_queue_pop(dma_queue);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
             for (uint32_t cid = 0; cid < cne1; ++cid) {
                 struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@@ -1467,20 +1467,20 @@ static void matvec_id(struct htp_matmul_type * mt,
             if (is0 >= HTP_SPAD_SRC0_NROWS) {
                 break;
             }
-            dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 2);
         }
 
         // Process src0 rows
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
-            const uint8_t * ss0 = dma_queue_pop(dma_queue);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
             mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
 
             // Prefetch next (n + spad_nrows) row
             const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
             const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
             if (pr0 < src0_end_row_x2) {
-                dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+                dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
                                src0_row_size_padded, src0_row_size, 2);
             }
         }
@@ -1489,9 +1489,9 @@ static void matvec_id(struct htp_matmul_type * mt,
         if (src0_end_row != src0_end_row_x2) {
             uint32_t       ir0 = src0_end_row_x2;
             const uint32_t is0 = (ir0 - src0_start_row);
-            dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 1);
-            const uint8_t * ss0 = dma_queue_pop(dma_queue);
+            const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
             mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
         }
     }