From: Max Krasnyansky
Date: Fri, 30 Jan 2026 08:28:03 +0000 (+0200)
Subject: hexagon: support for OP_CPY, host buffers now optional (llama/18822)
X-Git-Tag: v0.9.6~69
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c2f1fb20ae37140b8d2d548d473af80fd1d4bfb5;p=pkg%2Fggml%2Fsources%2Fggml

hexagon: support for OP_CPY, host buffers now optional (llama/18822)
---

diff --git a/src/ggml-hexagon/ggml-hexagon.cpp b/src/ggml-hexagon/ggml-hexagon.cpp
index 365a24b4..cf1eb994 100644
--- a/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/src/ggml-hexagon/ggml-hexagon.cpp
@@ -42,12 +42,12 @@
 #include "htp_iface.h"

 static size_t opt_ndev = 1;
-static size_t opt_nhvx = 0; // use all
-static int opt_arch = 0; // autodetect
+static size_t opt_nhvx = 0; // use all
+static int opt_arch = 0; // autodetect
 static int opt_etm = 0;
 static int opt_verbose = 0;
 static int opt_profile = 0;
-static int opt_hostbuf = 1;
+static int opt_hostbuf = 1; // hostbuf ON by default
 static int opt_experimental = 0;

 // Enable all stages by default
@@ -1753,6 +1753,9 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
 }

 static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
+    if (!opt_hostbuf) {
+        return ggml_backend_buffer_is_hexagon(b);
+    }
     return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
 }

@@ -2302,6 +2305,16 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
     return n_bufs;
 }

+static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    req->op = HTP_OP_CPY;
+
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+
+    return n_bufs;
+}
+
 static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
     req->op = HTP_OP_GET_ROWS;

@@ -2557,6 +2570,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
                 ggml_hexagon_dispatch_op(sess, node, flags);
                 break;

+            case GGML_OP_CPY:
+                ggml_hexagon_dispatch_op(sess, node, flags);
+                break;
+
             default:
                 GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
         }

@@ -2858,6 +2875,27 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
     return true;
 }

+static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst = op;
+
+    // for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
+    if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
+    if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
+
+    const bool sametype = (src0->type == dst->type);
+    const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
+    const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
+
+    // can handle any shape and any same-type (pretty slow if reshaping is required)
+    if (sametype) return true;
+
+    // cannot handle re-shaping and type conversion at the same time
+    if (!sameshape) return false;
+
+    return true;
+}
+
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     auto sess = static_cast(dev->context);

@@ -2936,6 +2974,10 @@ static bool
ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_get_rows(sess, op); break; + case GGML_OP_CPY: + supp = ggml_hexagon_supported_cpy(sess, op); + break; + default: break; } @@ -3061,7 +3103,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t } static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { + if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) { ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type; return (void *) fct; } @@ -3078,34 +3120,31 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4, "please update hexagon_type to match ggml_type"); + const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL"); const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF"); - + const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); + const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC"); + const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); + const char * str_etm = getenv("GGML_HEXAGON_ETM"); + const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); + const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); + const char * str_arch = getenv("GGML_HEXAGON_ARCH"); + + opt_experimental = str_experimental ? atoi(str_experimental) : 0; opt_verbose = str_verbose ? atoi(str_verbose) : 0; - opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr; - opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr; - opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr; - - const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); - if (str_opmask != nullptr) { - opt_opmask = strtoul(str_opmask, NULL, 0); - } - opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr; + opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; + opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; + opt_opsync = str_opsync ? atoi(str_opsync) : 0; + opt_profile = str_profile ? atoi(str_profile) : 0; + opt_etm = str_etm ? atoi(str_etm) : 0; + opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; + opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; - const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); - if (str_ndev) { - opt_ndev = strtoul(str_ndev, NULL, 0); - if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { - opt_ndev = GGML_HEXAGON_MAX_SESSIONS; - } + if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { + opt_ndev = GGML_HEXAGON_MAX_SESSIONS; } - const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); - if (str_nhvx) { - opt_nhvx = strtoul(str_nhvx, NULL, 0); - } - - const char * str_arch = getenv("GGML_HEXAGON_ARCH"); if (str_arch) { if (str_arch[0] == 'v') { str_arch++; @@ -3113,8 +3152,6 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_arch = strtoul(str_arch, NULL, 0); } - opt_hostbuf = str_hostbuf ? 
atoi(str_hostbuf) : 1; - reg->context = new ggml_hexagon_registry(reg); HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), diff --git a/src/ggml-hexagon/htp/CMakeLists.txt b/src/ggml-hexagon/htp/CMakeLists.txt index 6a34a215..e8ef2030 100644 --- a/src/ggml-hexagon/htp/CMakeLists.txt +++ b/src/ggml-hexagon/htp/CMakeLists.txt @@ -17,11 +17,7 @@ add_library(${HTP_LIB} SHARED main.c htp_iface_skel.c worker-pool.c - htp-dma.c - hvx-sigmoid.c - hvx-inverse.c - hvx-exp.c - hvx-utils.c + hex-dma.c matmul-ops.c binary-ops.c unary-ops.c @@ -31,10 +27,12 @@ add_library(${HTP_LIB} SHARED flash-attn-ops.c set-rows-ops.c get-rows-ops.c + cpy-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE $,HTP_DEBUG=1,NDEBUG=1> + $,FARF_HIGH=1,> FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}) build_idl(htp_iface.idl ${HTP_LIB}) diff --git a/src/ggml-hexagon/htp/act-ops.c b/src/ggml-hexagon/htp/act-ops.c index 88bd2ddc..c3daf5ad 100644 --- a/src/ggml-hexagon/htp/act-ops.c +++ b/src/ggml-hexagon/htp/act-ops.c @@ -2,27 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define htp_act_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ @@ -76,7 +69,7 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, +static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, const struct htp_tensor * src1, struct htp_tensor * dst, const int32_t * op_params, @@ -124,9 +117,9 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, data_src1 += swapped ? 
0 : nc_in_bytes; } - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); @@ -175,9 +168,9 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); //swiglu(x) = x1 * sigmoid(x0) - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, nc); - hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, - (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, nc); + hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, + (const uint8_t *) src1_spad_ptr, nc); } dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size, @@ -203,7 +196,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, +static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, const struct htp_tensor * src1, struct htp_tensor * dst, const int32_t * op_params, @@ -249,9 +242,9 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, data_src1 += swapped ? 
0 : nc_in_bytes; } - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); @@ -304,18 +297,18 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // x (src0_spad_data) = std::min(src0_p[k], limit); - hvx_min_scalar_f32((const uint8_t *) src0_spad_ptr, limit, (uint8_t *) src0_spad_ptr, nc); + hvx_min_scalar_f32((uint8_t *) src0_spad_ptr, (const uint8_t *) src0_spad_ptr, limit, nc); // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit); - hvx_clamp_scalar_f32((const uint8_t *) src1_spad_ptr, -limit, limit, (uint8_t *) src1_spad_ptr, nc); + hvx_clamp_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, -limit, limit, nc); // y (src1_spad_data) = y1 + 1.f - hvx_add_scalar_f32((const uint8_t *) src1_spad_ptr, 1.0, (uint8_t *) src1_spad_ptr, nc); + hvx_add_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, 1.0, nc); // x1 (dst_spad_data) = alpha * (x) - hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, alpha, (uint8_t *) dst_spad_ptr, nc); + hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, alpha, nc); // x2 (dst_spad_data) = sigmoid(x1) = 1/(1+exp(-x1)) - hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, nc); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc); // out = x * sigmoid(alpha * x) * (y + 1.f) - hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, - (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc); + hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, + (const uint8_t *) src1_spad_ptr, nc); } dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size, @@ -342,7 +335,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, } -static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, +static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, struct htp_spad * src0_spad, @@ -358,8 +351,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); const uint32_t src0_nrows = ne01 * ne02 * ne03; @@ -415,9 +408,9 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // gelu = x * sigmoid(1.702 * x) // current implementation - hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 
1.702, (uint8_t *) dst_spad_ptr, ne0); - hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); - hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); + hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); + hvx_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -442,15 +435,15 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { +static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, +static void unary_silu_f32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, struct htp_spad * src0_spad, @@ -466,8 +459,8 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); const uint32_t src0_nrows = ne01 * ne02 * ne03; @@ -522,8 +515,8 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // silu = x * sigmoid(x) - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); - hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0); + hvx_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -548,25 +541,25 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) { +static void unary_silu_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) { +static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - 
glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, + glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) { +static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, + glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static int execute_op_activations_fp32(struct htp_ops_context * octx) { +static int execute_op_activations_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; const struct htp_tensor * src0 = &octx->src0; @@ -583,21 +576,21 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { switch (octx->op) { case HTP_OP_UNARY_SILU: - act_op_func = unary_silu_fp32; + act_op_func = unary_silu_f32; op_type = "silu-f32"; break; case HTP_OP_GLU_SWIGLU: - act_op_func = glu_swiglu_fp32; + act_op_func = glu_swiglu_f32; op_type = "swiglu-f32"; break; case HTP_OP_GLU_SWIGLU_OAI: - act_op_func = glu_swiglu_oai_fp32; + act_op_func = glu_swiglu_oai_f32; op_type = "swiglu-oai-f32"; break; case HTP_OP_UNARY_GELU: - act_op_func = unary_gelu_fp32; + act_op_func = unary_gelu_f32; op_type = "gelu-f32"; break; default: @@ -617,9 +610,9 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { src1_row_size = src0_row_size; } - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size @@ -670,7 +663,7 @@ int op_activations(struct htp_ops_context * octx) { switch (octx->src0.type) { case HTP_TYPE_F32: - err = execute_op_activations_fp32(octx); + err = execute_op_activations_f32(octx); break; default: diff --git a/src/ggml-hexagon/htp/binary-ops.c b/src/ggml-hexagon/htp/binary-ops.c index 8ed7f67d..de22afe4 100644 --- a/src/ggml-hexagon/htp/binary-ops.c +++ b/src/ggml-hexagon/htp/binary-ops.c @@ -2,36 +2,25 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" -typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0, - const uint8_t * src1, - uint8_t * data_dst, - const int num_elems); +typedef void (*hvx_elemwise_f32_func)(uint8_t * data_dst, const uint8_t * src0, const uint8_t * src1, const uint32_t num_elems); static hvx_elemwise_f32_func func_table_HVX[] = { hvx_mul_f32, hvx_add_f32, 
hvx_sub_f32 }; -static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt }; +static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_aa, hvx_add_f32_aa, hvx_sub_f32_aa }; #define htp_binary_preamble \ const struct htp_tensor * src0 = &octx->src0; \ @@ -98,9 +87,8 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx, int is_aligned = 1; int opt_path = 0; - if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || - (0 == htp_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || + (0 == hex_is_aligned((void *) dst->data, VLEN))) { is_aligned = 0; } if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { @@ -130,24 +118,24 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx, const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size; if (ir + 1 < src0_end_row) { - htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); + hex_l2fetch(src0_ptr + ne00, src0_row_size, src0_row_size, 1); if (src1_row_size == src0_row_size) { - htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size); + hex_l2fetch(src1_ptr, src1_row_size, src1_row_size, 1); } } const uint32_t nr0 = ne00 / ne10; if (nr0 > 1) { if ((1 == is_aligned) && (nr0 == ne00)) { - hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0); + hvx_splat_f32_a(spad_data_th, *(float *) src1_ptr, nr0); } else { for (uint32_t r = 0; r < nr0; r++) { memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11); } } - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, ne00); } else { - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, ne00); } src0_ptr += src0_row_size; @@ -185,11 +173,6 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx, uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || - (0 == htp_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n"); - } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; uint8_t * restrict data_dst = (uint8_t *) dst->data; @@ -210,9 +193,9 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx, const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 + i11 * nb11); if (ir + 1 < src0_end_row) { - htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); + hex_l2fetch(src0_ptr + ne00, src0_row_size, src0_row_size, 1); if (src1_row_size == src0_row_size) { - htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size); + hex_l2fetch(src1_ptr + ne10, src1_row_size, src1_row_size, 1); } } @@ -221,9 +204,9 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx, for (uint32_t r = 0; r < nr0; r++) { memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10); } - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) 
dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) spad_data, ne00); } else { - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, ne00); } } @@ -299,9 +282,9 @@ static int execute_op_binary_f32(struct htp_ops_context * octx) { const size_t dst_row_size = dst->nb[1]; // VTCM scratchpads for all tensors - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/cpy-ops.c b/src/ggml-hexagon/htp/cpy-ops.c new file mode 100644 index 00000000..559ca183 --- /dev/null +++ b/src/ggml-hexagon/htp/cpy-ops.c @@ -0,0 +1,251 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include + +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" + +struct htp_copy_context { + struct htp_ops_context * octx; + + uint32_t src0_type_size; + uint32_t src0_block_size; + + uint32_t dst_type_size; + uint32_t dst_block_size; + + uint32_t src0_blocks_per_row; + uint32_t dst_blocks_per_row; + + uint32_t src0_nrows_per_thread; + + void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith); +}; + +#define cpy_preamble \ + struct htp_tensor *src0 = &octx->src0; \ + struct htp_tensor *dst = &octx->dst; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ + const uint32_t nr = ne01; + +static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? 
(ir0 + dr) : nr; + + // copy by rows + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + #pragma unroll(2) + for (uint32_t i01 = ir0; i01 < ir1; i01++) { + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2); + hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size); + } + } + } +} + +static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + + // dst counters + int64_t k10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + // number of blocks in a row + const int64_t nk00 = ct->src0_blocks_per_row; + const int64_t nk0 = ct->dst_blocks_per_row; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + k10 += nk00 * ir0; + while (k10 >= nk0) { + k10 -= nk0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t k00 = 0; k00 < nk00; k00++) { + const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + memcpy(dst_ptr, src0_ptr, ct->dst_type_size); + + if (++k10 == nk0) { + k10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + k10 += nk00 * (ne01 - ir1); + while (k10 >= nk0) { + k10 -= nk0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } +} + +static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + + // copy by rows + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + #pragma unroll(2) + for (uint32_t i01 = ir0; i01 < ir1; i01++) { + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + hex_l2fetch(src0_ptr, ne00 * sizeof(float), nb01, 2); + hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00); + } + } + } +} + +static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? 
(ir0 + dr) : nr; + + // copy by rows + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + #pragma unroll(2) + for (uint32_t i01 = ir0; i01 < ir1; i01++) { + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + hex_l2fetch(src0_ptr, ne00 * sizeof(__fp16), nb01, 2); + hvx_copy_f32_f16_uu(dst_ptr, src0_ptr, ne00); + } + } + } +} + +static void cpy_work_func(unsigned int n, unsigned int i, void *data) { + struct htp_copy_context *ct = (struct htp_copy_context *) data; + ct->copy(ct, ct->octx, n, i); +} + +int op_cpy(struct htp_ops_context * octx) { + cpy_preamble; + + struct htp_copy_context ct; + ct.octx = octx; + + switch (src0->type) { + case HTP_TYPE_F32: ct.src0_type_size = 4; ct.src0_block_size = 1; ct.src0_blocks_per_row = ne00 / 1; break; + case HTP_TYPE_F16: ct.src0_type_size = 2; ct.src0_block_size = 1; ct.src0_blocks_per_row = ne00 / 1; break; + default: + return HTP_STATUS_NO_SUPPORT; + } + + switch (dst->type) { + case HTP_TYPE_F32: ct.dst_type_size = 4; ct.dst_block_size = 1; ct.dst_blocks_per_row = ne0 / 1; break; + case HTP_TYPE_F16: ct.dst_type_size = 2; ct.dst_block_size = 1; ct.dst_blocks_per_row = ne0 / 1; break; + default: + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const bool sametype = (src0->type == dst->type); + const bool transposed = (nb00 > nb01) || (nb0 > nb1); + const bool sameshape = !transposed && (ne00 == ne0 && ne01 == ne1 && ne02 == ne2 && ne03 == ne3); + + const uint32_t n_jobs = MIN(nr, octx->n_threads); + ct.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; + + if (sametype && sameshape) { + ct.copy = cpy_thread_sametype_sameshape; + } else if (sameshape) { + /**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32) + ct.copy = cpy_thread_f16_f32_sameshape; + else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16) + ct.copy = cpy_thread_f32_f16_sameshape; + else + return HTP_STATUS_NO_SUPPORT; + } else if (sametype) { + ct.copy = cpy_thread_sametype_reshape; + } else { + return HTP_STATUS_NO_SUPPORT; + } + + worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_jobs); + + return HTP_STATUS_OK; +} diff --git a/src/ggml-hexagon/htp/flash-attn-ops.c b/src/ggml-hexagon/htp/flash-attn-ops.c index 04a7b843..1de47d0f 100644 --- a/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/src/ggml-hexagon/htp/flash-attn-ops.c @@ -2,25 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include + #include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" // Dot product of FP32 and FP16 vectors, accumulating to float static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) { @@ -70,8 +65,8 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s)); - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = 
Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(r, 4, rsum); } @@ -111,8 +106,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s)); - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(r, 4, rsum); } @@ -124,7 +119,7 @@ static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - HVX_Vector S = hvx_vec_splat_fp16(s); + HVX_Vector S = hvx_vec_splat_f16(s); uint32_t i = 0; #pragma unroll(4) @@ -148,7 +143,7 @@ static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict if (nloe) { HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i])); - hvx_vec_store_u(&ptr_y[i], nloe * 4, xy); + hvx_vec_store_a(&ptr_y[i], nloe * 4, xy); } } } @@ -225,18 +220,18 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in const uint32_t DV = nev0; const size_t size_q_row = DK * ((q->type == HTP_TYPE_F32) ? 4 : 2); - const size_t size_q_row_padded = htp_round_up(size_q_row, 128); + const size_t size_q_row_padded = hex_round_up(size_q_row, 128); const size_t size_k_row = DK * sizeof(__fp16); const size_t size_v_row = DV * sizeof(__fp16); const size_t size_m_row = FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16); // Treat block as one row for mask - const size_t size_k_row_padded = htp_round_up(size_k_row, 128); - const size_t size_v_row_padded = htp_round_up(size_v_row, 128); + const size_t size_k_row_padded = hex_round_up(size_k_row, 128); + const size_t size_v_row_padded = hex_round_up(size_v_row, 128); const size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE; const size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE; - const size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); + const size_t size_m_block = hex_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); // Scratchpad buffers for Q, K, V, Mask, and VKQ32 accumulator uint8_t * spad_q = octx->src0_spad.data + octx->src0_spad.size_per_thread * ith; @@ -272,8 +267,8 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in float M = -INFINITY; // maximum KQ value // Clear accumulator + hvx_splat_f32_a(spad_a, 0, DV); float * VKQ32 = (float *) spad_a; - memset(VKQ32, 0, DV * sizeof(float)); const __fp16 * mp_base = NULL; if (mask) { @@ -340,30 +335,30 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in // 2. Softcap if (logit_softcap != 0.0f) { - scores = hvx_vec_tanh_fp32(scores); - scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_fp32(logit_softcap)); + scores = hvx_vec_tanh_f32(scores); + scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_f32(logit_softcap)); scores = Q6_Vsf_equals_Vqf32(scores); } // 3. 
Mask if (mask) { const __fp16 * mp = m_base + ic; - HVX_Vector m_vals_fp16 = *(const HVX_UVector *) mp; + HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp; - HVX_Vector one_fp16 = Q6_Vh_vsplat_R(0x3c00); - HVX_VectorPair m_vals_fp32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_fp16), one_fp16); + HVX_Vector one_f16 = Q6_Vh_vsplat_R(0x3c00); + HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), one_f16); - HVX_Vector m_vals_fp32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_fp32_pair)); + HVX_Vector m_vals_f32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_f32_pair)); - HVX_Vector slope_vec = hvx_vec_splat_fp32(slope); - HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_fp32, slope_vec); + HVX_Vector slope_vec = hvx_vec_splat_f32(slope); + HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_f32, slope_vec); scores = Q6_Vqf32_vadd_VsfVsf(scores, Q6_Vsf_equals_Vqf32(add_val)); scores = Q6_Vsf_equals_Vqf32(scores); } // 4. Online Softmax Update - HVX_Vector v_max = hvx_vec_reduce_max_fp32(scores); - float m_block = hvx_vec_get_fp32(v_max); + HVX_Vector v_max = hvx_vec_reduce_max_f32(scores); + float m_block = hvx_vec_get_f32(v_max); float M_old = M; float M_new = (m_block > M) ? m_block : M; @@ -374,12 +369,12 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms); S = S * ms; - HVX_Vector M_new_vec = hvx_vec_splat_fp32(M_new); + HVX_Vector M_new_vec = hvx_vec_splat_f32(M_new); HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_new_vec); - HVX_Vector P = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(scores_shifted)); + HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted)); - HVX_Vector p_sum_vec = hvx_vec_fp32_reduce_sum(P); - float p_sum = hvx_vec_get_fp32(p_sum_vec); + HVX_Vector p_sum_vec = hvx_vec_reduce_sum_f32(P); + float p_sum = hvx_vec_get_f32(p_sum_vec); S += p_sum; // 5. Accumulate V @@ -484,9 +479,9 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1; if (dst->type == HTP_TYPE_F32) { - hvx_copy_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV); + hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); } else if (dst->type == HTP_TYPE_F16) { - hvx_copy_fp16_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV); + hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); } } } @@ -523,16 +518,16 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { octx->src3_div3 = init_fastdiv_values(mask->ne[3]); } - size_t size_q_row_padded = htp_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 4 : 2), 128); - size_t size_k_row_padded = htp_round_up(k->ne[0] * sizeof(__fp16), 128); - size_t size_v_row_padded = htp_round_up(v->ne[0] * sizeof(__fp16), 128); + size_t size_q_row_padded = hex_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 
4 : 2), 128); + size_t size_k_row_padded = hex_round_up(k->ne[0] * sizeof(__fp16), 128); + size_t size_v_row_padded = hex_round_up(v->ne[0] * sizeof(__fp16), 128); size_t size_q_block = size_q_row_padded * 1; // single row for now size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE; size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE; - size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); + size_t size_m_block = hex_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); - size_t size_vkq_acc = htp_round_up(v->ne[0] * sizeof(float), 128); // VKQ32 + size_t size_vkq_acc = hex_round_up(v->ne[0] * sizeof(float), 128); // VKQ32 octx->src0_spad.size_per_thread = size_q_block * 1; octx->src1_spad.size_per_thread = size_k_block * 2; diff --git a/src/ggml-hexagon/htp/get-rows-ops.c b/src/ggml-hexagon/htp/get-rows-ops.c index 54321421..a657cd2d 100644 --- a/src/ggml-hexagon/htp/get-rows-ops.c +++ b/src/ggml-hexagon/htp/get-rows-ops.c @@ -2,14 +2,9 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include + #include #include @@ -19,7 +14,6 @@ #include "htp-msg.h" #include "htp-ops.h" #include "hvx-utils.h" -#include "ops-utils.h" #define get_rows_preamble \ const uint32_t ne00 = octx->src0.ne[0]; \ @@ -72,7 +66,7 @@ static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03; const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3; - hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); + hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } return HTP_STATUS_OK; diff --git a/src/ggml-hexagon/htp/hex-dma.c b/src/ggml-hexagon/htp/hex-dma.c new file mode 100644 index 00000000..44e1be40 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-dma.c @@ -0,0 +1,63 @@ +#include "hex-dma.h" + +#include +#include +#include + +#pragma clang diagnostic ignored "-Wunused-function" + +static inline uint32_t pow2_ceil(uint32_t x) { + if (x <= 1) { + return 1; + } + int p = 2; + x--; + while (x >>= 1) { + p <<= 1; + } + return p; +} + +dma_queue * dma_queue_create(size_t capacity) { + dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue)); + if (q == NULL) { + FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__); + return NULL; + } + + capacity = pow2_ceil(capacity); + + memset(q, 0, sizeof(dma_queue)); + q->capacity = capacity; + q->idx_mask = capacity - 1; + + q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t)); + memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t)); + + q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr)); + memset(q->dptr, 0, capacity * sizeof(dma_ptr)); + + q->tail = &q->desc[capacity - 1]; + + if (!q->desc && !q->dptr) { + FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__); + return NULL; + } + + FARF(HIGH, "dma-queue: capacity %u\n", capacity); + + return q; +} + +void dma_queue_delete(dma_queue * q) { + if (!q) { + return; + } + free(q->desc); + free(q->dptr); + free(q); +} + +void dma_queue_flush(dma_queue * q) { + while (dma_queue_pop(q).dst != NULL) ; +} diff --git a/src/ggml-hexagon/htp/hex-dma.h b/src/ggml-hexagon/htp/hex-dma.h new file mode 100644 index 00000000..d1ddb0ec --- /dev/null +++ b/src/ggml-hexagon/htp/hex-dma.h @@ 
-0,0 +1,156 @@ +#ifndef HTP_DMA_H +#define HTP_DMA_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + void *dst; + const void *src; +} dma_ptr; + +typedef struct { + hexagon_udma_descriptor_type1_t * desc; // descriptor pointers + hexagon_udma_descriptor_type1_t * tail; // tail pointer + dma_ptr * dptr; // dst/src pointers + uint32_t push_idx; + uint32_t pop_idx; + uint32_t capacity; + uint32_t idx_mask; +} dma_queue; + +dma_queue * dma_queue_create(size_t capacity); +void dma_queue_delete(dma_queue * q); +void dma_queue_flush(dma_queue * q); + +// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead +// but those do not seem to always compiler properly. +static inline void dmstart(void * next) { + asm volatile(" release(%0):at" : : "r"(next)); + asm volatile(" dmstart(%0)" : : "r"(next)); +} + +static inline void dmlink(void * cur, void * next) { + asm volatile(" release(%0):at" : : "r"(next)); + asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next)); +} + +static inline unsigned int dmpoll(void) { + unsigned int ret = 0; + asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory"); + return ret; +} + +static inline unsigned int dmwait(void) { + unsigned int ret = 0; + asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory"); + return ret; +} + +static inline dma_ptr dma_make_ptr(void *dst, const void *src) +{ + dma_ptr p = { dst, src }; + return p; +} + +static inline bool dma_queue_push(dma_queue * q, + dma_ptr dptr, + size_t dst_row_size, + size_t src_row_size, + size_t width, // width in bytes. number of bytes to transfer per row + size_t nrows) { + if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { + FARF(ERROR, "dma-push: queue full\n"); + return false; + } + + hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx]; + + desc->next = NULL; + desc->length = 0; + desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1; + desc->dstbypass = 1; + desc->srcbypass = 1; +#if __HVX_ARCH__ >= 73 + desc->dstbypass = 1; + desc->srcbypass = 1; +#else + desc->dstbypass = 0; + desc->srcbypass = 1; +#endif + desc->order = 0; + desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; + desc->src = (void *) dptr.src; + desc->dst = (void *) dptr.dst; + desc->allocation = 0; + desc->padding = 0; + desc->roiwidth = width; + desc->roiheight = nrows; + desc->srcstride = src_row_size; + desc->dststride = dst_row_size; + desc->srcwidthoffset = 0; + desc->dstwidthoffset = 0; + + q->dptr[q->push_idx] = dptr; + + dmlink(q->tail, desc); + q->tail = desc; + + // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); + q->push_idx = (q->push_idx + 1) & q->idx_mask; + return true; +} + +static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, + dma_ptr dptr, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows); +} + + +static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, + dma_ptr dptr, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows); +} + +static inline dma_ptr dma_queue_pop(dma_queue * q) { + dma_ptr dptr = { NULL }; + + if (q->push_idx == q->pop_idx) { + return dptr; + } + + hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx]; + + // Wait for desc to complete + while (1) { + dmpoll(); + if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) { + break; + } + // FARF(ERROR, "dma-pop: waiting for DMA 
: %u\n", q->pop_idx); + } + + dptr = q->dptr[q->pop_idx]; + + // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); + q->pop_idx = (q->pop_idx + 1) & q->idx_mask; + return dptr; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HTP_DMA_H */ diff --git a/src/ggml-hexagon/htp/hex-dump.h b/src/ggml-hexagon/htp/hex-dump.h new file mode 100644 index 00000000..e3badb57 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-dump.h @@ -0,0 +1,77 @@ +#ifndef HEX_DUMP_H +#define HEX_DUMP_H + +#include + +static inline void hex_dump_int8_line(char * pref, const int8_t * x, int n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n && p < p_end; i++) { + p += snprintf(p, p_end - p, "%d, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n && p < p_end; i++) { + p += snprintf(p, p_end - p, "%d, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_int32_line(char * pref, const int32_t * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += snprintf(p, p_end - p, "%d, ", (int) x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_f16_line(char * pref, const __fp16 * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_f32_line(char * pref, const float * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += snprintf(p, p_end - p, "%.6f, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_f32(char * pref, const float * x, uint32_t n) { + uint32_t n0 = n / 16; + uint32_t n1 = n % 16; + + uint32_t i = 0; + for (; i < n0; i++) { + hex_dump_f32_line(pref, x + (16 * i), 16); + } + if (n1) { + hex_dump_f32_line(pref, x + (16 * i), n1); + } +} + +static inline void hex_dump_f16(char * pref, const __fp16 * x, uint32_t n) { + uint32_t n0 = n / 16; + uint32_t n1 = n % 16; + + uint32_t i = 0; + for (; i < n0; i++) { + hex_dump_f16_line(pref, x + (16 * i), 16); + } + if (n1) { + hex_dump_f16_line(pref, x + (16 * i), n1); + } +} + +#endif /* HEX_DUMP_H */ diff --git a/src/ggml-hexagon/htp/hex-fastdiv.h b/src/ggml-hexagon/htp/hex-fastdiv.h new file mode 100644 index 00000000..b7b58675 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-fastdiv.h @@ -0,0 +1,37 @@ +#ifndef HEX_FASTDIV_H +#define HEX_FASTDIV_H + +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. 
+// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +struct fastdiv_values { + uint32_t mp; + uint32_t l; +}; + +static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { + struct fastdiv_values result = { 0, 0 }; + // compute L = ceil(log2(d)); + while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { + ++(result.l); + } + + result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); + return result; +} + +static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { + // Compute high 32 bits of n * mp + const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32); // mulhi(n, mp) + // add n, apply bit shift + return (hi + n) >> vals->l; +} + +static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) { + return n - fastdiv(n, vals) * d; +} + +#endif /* HEX_FASTDIV_H */ diff --git a/src/ggml-hexagon/htp/hex-utils.h b/src/ggml-hexagon/htp/hex-utils.h new file mode 100644 index 00000000..fb8a25a3 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-utils.h @@ -0,0 +1,51 @@ +#ifndef HEX_UTILS_H +#define HEX_UTILS_H + +#include +#include + +#include "hexagon_types.h" + +#include "hex-fastdiv.h" +#include "hex-dump.h" + +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +static inline uint64_t hex_get_cycles() { + uint64_t cycles = 0; + asm volatile(" %0 = c15:14\n" : "=r"(cycles)); + return cycles; +} + +static inline uint64_t hex_get_pktcnt() { + uint64_t pktcnt; + asm volatile(" %0 = c19:18\n" : "=r"(pktcnt)); + return pktcnt; +} + +static inline int32_t hex_is_aligned(void * addr, uint32_t align) { + return ((size_t) addr & (align - 1)) == 0; +} + +static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +static inline uint32_t hex_round_up(uint32_t n, uint32_t m) { + return m * ((n + m - 1) / m); +} + +static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) { + const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); + Q6_l2fetch_AP((void *) p, control); +} + +#endif /* HEX_UTILS_H */ diff --git a/src/ggml-hexagon/htp/htp-ctx.h b/src/ggml-hexagon/htp/htp-ctx.h index 4bd0ea7a..a707d982 100644 --- a/src/ggml-hexagon/htp/htp-ctx.h +++ b/src/ggml-hexagon/htp/htp-ctx.h @@ -1,7 +1,7 @@ #ifndef HTP_CTX_H #define HTP_CTX_H -#include "htp-dma.h" +#include "hex-dma.h" #include "worker-pool.h" #include diff --git a/src/ggml-hexagon/htp/htp-dma.c b/src/ggml-hexagon/htp/htp-dma.c deleted file mode 100644 index 880c4542..00000000 --- a/src/ggml-hexagon/htp/htp-dma.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "htp-dma.h" - -#include -#include -#include - -#pragma clang diagnostic ignored "-Wunused-function" - -static inline uint32_t pow2_ceil(uint32_t x) { - if (x <= 1) { - return 1; - } - int p = 2; - x--; - while (x >>= 1) { - p <<= 1; - } - return p; -} - -dma_queue * dma_queue_create(size_t capacity) { - dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue)); - if (q == NULL) { - FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__); - return NULL; - } - - capacity = pow2_ceil(capacity); - - memset(q, 0, sizeof(dma_queue)); - q->capacity = capacity; - q->idx_mask 
= capacity - 1; - - q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t)); - memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t)); - - q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr)); - memset(q->dptr, 0, capacity * sizeof(dma_ptr)); - - q->tail = &q->desc[capacity - 1]; - - if (!q->desc && !q->dptr) { - FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__); - return NULL; - } - - FARF(HIGH, "dma-queue: capacity %u\n", capacity); - - return q; -} - -void dma_queue_delete(dma_queue * q) { - if (!q) { - return; - } - free(q->desc); - free(q->dptr); - free(q); -} - -void dma_queue_flush(dma_queue * q) { - while (dma_queue_pop(q).dst != NULL) ; -} diff --git a/src/ggml-hexagon/htp/htp-dma.h b/src/ggml-hexagon/htp/htp-dma.h deleted file mode 100644 index 32fd06e7..00000000 --- a/src/ggml-hexagon/htp/htp-dma.h +++ /dev/null @@ -1,157 +0,0 @@ -#ifndef HTP_DMA_H -#define HTP_DMA_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - void *dst; - const void *src; -} dma_ptr; - -typedef struct { - hexagon_udma_descriptor_type1_t * desc; // descriptor pointers - hexagon_udma_descriptor_type1_t * tail; // tail pointer - dma_ptr * dptr; // dst/src pointers - uint32_t push_idx; - uint32_t pop_idx; - uint32_t capacity; - uint32_t idx_mask; -} dma_queue; - -dma_queue * dma_queue_create(size_t capacity); -void dma_queue_delete(dma_queue * q); -void dma_queue_flush(dma_queue * q); - -// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead -// but those do not seem to always compiler properly. -static inline void dmstart(void * next) { - asm volatile(" release(%0):at" : : "r"(next)); - asm volatile(" dmstart(%0)" : : "r"(next)); -} - -static inline void dmlink(void * cur, void * next) { - asm volatile(" release(%0):at" : : "r"(next)); - asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next)); -} - -static inline unsigned int dmpoll(void) { - unsigned int ret = 0; - asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory"); - return ret; -} - -static inline unsigned int dmwait(void) { - unsigned int ret = 0; - asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory"); - return ret; -} - -static inline dma_ptr dma_make_ptr(void *dst, const void *src) -{ - dma_ptr p = { dst, src }; - return p; -} - -static inline bool dma_queue_push(dma_queue * q, - dma_ptr dptr, - size_t dst_row_size, - size_t src_row_size, - size_t width, // width in bytes. 
number of bytes to transfer per row - size_t nrows) { - if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { - FARF(ERROR, "dma-push: queue full\n"); - return false; - } - - hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx]; - - desc->next = NULL; - desc->length = 0; - desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1; - desc->dstbypass = 1; - desc->srcbypass = 1; -#if __HVX_ARCH__ >= 73 - desc->dstbypass = 1; - desc->srcbypass = 1; -#else - desc->dstbypass = 0; - desc->srcbypass = 1; -#endif - desc->order = 0; - desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; - desc->src = (void *) dptr.src; - desc->dst = (void *) dptr.dst; - desc->allocation = 0; - desc->padding = 0; - desc->roiwidth = width; - desc->roiheight = nrows; - desc->srcstride = src_row_size; - desc->dststride = dst_row_size; - desc->srcwidthoffset = 0; - desc->dstwidthoffset = 0; - - q->dptr[q->push_idx] = dptr; - - dmlink(q->tail, desc); - q->tail = desc; - - // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); - q->push_idx = (q->push_idx + 1) & q->idx_mask; - return true; -} - -static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, - dma_ptr dptr, - size_t dst_row_size, - size_t src_row_size, - size_t nrows) { - return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows); -} - - -static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, - dma_ptr dptr, - size_t dst_row_size, - size_t src_row_size, - size_t nrows) { - return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows); -} - -static inline dma_ptr dma_queue_pop(dma_queue * q) { - dma_ptr dptr = { NULL }; - - if (q->push_idx == q->pop_idx) { - return dptr; - } - - hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx]; - - // Wait for desc to complete - while (1) { - dmpoll(); - if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) { - break; - } - // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); - } - - dptr = q->dptr[q->pop_idx]; - - // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); - q->pop_idx = (q->pop_idx + 1) & q->idx_mask; - return dptr; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif /* HTP_DMA_H */ diff --git a/src/ggml-hexagon/htp/htp-msg.h b/src/ggml-hexagon/htp/htp-msg.h index 846d0617..f49e8ee4 100644 --- a/src/ggml-hexagon/htp/htp-msg.h +++ b/src/ggml-hexagon/htp/htp-msg.h @@ -63,6 +63,7 @@ enum htp_op { HTP_OP_SET_ROWS = 15, HTP_OP_SCALE = 16, HTP_OP_GET_ROWS = 17, + HTP_OP_CPY = 18, INVALID }; diff --git a/src/ggml-hexagon/htp/htp-ops.h b/src/ggml-hexagon/htp/htp-ops.h index 7c828ae6..602a2775 100644 --- a/src/ggml-hexagon/htp/htp-ops.h +++ b/src/ggml-hexagon/htp/htp-ops.h @@ -4,11 +4,12 @@ #include "htp-ctx.h" #include "htp-msg.h" #include "worker-pool.h" -#include "ops-utils.h" #include #include +#include + // ggml-common.h must be included prior to this header struct htp_spad { @@ -74,6 +75,14 @@ struct htp_ops_context { struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10 struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11 + struct fastdiv_values cpy_div_ne01; // fastdiv values for ne01 + struct fastdiv_values cpy_div_ne02; // fastdiv values for ne02 + struct fastdiv_values cpy_div_ne03; // fastdiv values for ne03 + + struct fastdiv_values cpy_rshp_div_n0; // fastdiv values for ne00 + struct fastdiv_values cpy_rshp_div_n1n0; // fastdiv values for ne00*ne01 + struct fastdiv_values cpy_rshp_div_n2n1n0; // fastdiv values for ne00*ne01*ne02 + uint32_t flags; }; @@ 
-88,5 +97,6 @@ int op_rope(struct htp_ops_context * octx); int op_flash_attn_ext(struct htp_ops_context * octx); int op_set_rows(struct htp_ops_context * octx); int op_get_rows(struct htp_ops_context * octx); +int op_cpy(struct htp_ops_context * octx); #endif /* HTP_OPS_H */ diff --git a/src/ggml-hexagon/htp/hvx-arith.h b/src/ggml-hexagon/htp/hvx-arith.h new file mode 100644 index 00000000..3449739a --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-arith.h @@ -0,0 +1,457 @@ +#ifndef HVX_ARITH_H +#define HVX_ARITH_H + +#include +#include +#include +#include + +#include "hvx-base.h" +#include "hex-utils.h" + +// +// Binary operations (add, mul, sub) +// + +#define hvx_arith_loop_body(dst_type, src0_type, src1_type, vec_store, vec_op) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src0_type * restrict vsrc0 = (src0_type *) src0; \ + src1_type * restrict vsrc1 = (src1_type *) src1; \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = vec_op(vsrc0[i], vsrc1[i]); \ + } \ + if (nloe) { \ + HVX_Vector v = vec_op(vsrc0[i], vsrc1[i]); \ + vec_store((void *) &vdst[i], nloe * elem_size, v); \ + } \ + } while(0) + +#if __HVX_ARCH__ < 79 +#define HVX_OP_ADD(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)) +#define HVX_OP_SUB(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b)) +#define HVX_OP_MUL(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)) +#else +#define HVX_OP_ADD(a, b) Q6_Vsf_vadd_VsfVsf(a, b) +#define HVX_OP_SUB(a, b) Q6_Vsf_vsub_VsfVsf(a, b) +#define HVX_OP_MUL(a, b) Q6_Vsf_vmpy_VsfVsf(a, b) +#endif + +// ADD variants + +static inline void hvx_add_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_ADD); +} + +static inline void hvx_add_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_ADD); +} + +static inline void hvx_add_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_ADD); +} + +static inline void hvx_add_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_ADD); +} + +// SUB variants + +static inline void hvx_sub_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_SUB); +} + +static inline void hvx_sub_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned 
long) src0 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_SUB); +} + +static inline void hvx_sub_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_SUB); +} + +static inline void hvx_sub_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_SUB); +} + +// MUL variants + +static inline void hvx_mul_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MUL); +} + +static inline void hvx_mul_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MUL); +} + +static inline void hvx_mul_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_MUL); +} + +static inline void hvx_mul_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MUL); +} + +// Dispatchers + +static inline void hvx_add_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) { + if (hex_is_aligned((void *) src1, 128)) { + hvx_add_f32_aa(dst, src0, src1, num_elems); + } else { + hvx_add_f32_au(dst, src0, src1, num_elems); + } + } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) { + hvx_add_f32_ua(dst, src0, src1, num_elems); + } else { + hvx_add_f32_uu(dst, src0, src1, num_elems); + } +} + +static inline void hvx_sub_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) { + if (hex_is_aligned((void *) src1, 128)) { + hvx_sub_f32_aa(dst, src0, src1, num_elems); + } else { + hvx_sub_f32_au(dst, src0, src1, num_elems); + } + } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) { + hvx_sub_f32_ua(dst, src0, src1, num_elems); + } else { + hvx_sub_f32_uu(dst, src0, src1, num_elems); + } +} + +static inline void hvx_mul_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) { + if (hex_is_aligned((void *) src1, 128)) { + hvx_mul_f32_aa(dst, src0, src1, num_elems); + } else { + hvx_mul_f32_au(dst, src0, src1, num_elems); + } + } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) { + 
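                // reaching this branch means dst is unaligned while both sources are
                // aligned, so the _ua variant uses aligned loads with unaligned stores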
hvx_mul_f32_ua(dst, src0, src1, num_elems); + } else { + hvx_mul_f32_uu(dst, src0, src1, num_elems); + } +} + +// Mul-Mul Optimized + +static inline void hvx_mul_mul_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint8_t * restrict src2, const uint32_t num_elems) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + assert((unsigned long) src2 % 128 == 0); + + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + HVX_Vector * restrict vsrc0 = (HVX_Vector *) src0; + HVX_Vector * restrict vsrc1 = (HVX_Vector *) src1; + HVX_Vector * restrict vsrc2 = (HVX_Vector *) src2; + + const uint32_t elem_size = sizeof(float); + const uint32_t epv = 128 / elem_size; + const uint32_t nvec = num_elems / epv; + const uint32_t nloe = num_elems % epv; + + uint32_t i = 0; + + _Pragma("unroll(4)") + for (; i < nvec; i++) { + HVX_Vector v1 = HVX_OP_MUL(vsrc0[i], vsrc1[i]); + vdst[i] = HVX_OP_MUL(v1, vsrc2[i]); + } + + if (nloe) { + HVX_Vector v1 = HVX_OP_MUL(vsrc0[i], vsrc1[i]); + HVX_Vector v2 = HVX_OP_MUL(v1, vsrc2[i]); + hvx_vec_store_a((void *) &vdst[i], nloe * elem_size, v2); + } +} + +// Scalar Operations + +#define hvx_scalar_loop_body(dst_type, src_type, vec_store, scalar_op_macro) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + HVX_Vector v = vsrc[i]; \ + vdst[i] = scalar_op_macro(v); \ + } \ + if (nloe) { \ + HVX_Vector v = vsrc[i]; \ + v = scalar_op_macro(v); \ + vec_store((void *) &vdst[i], nloe * elem_size, v); \ + } \ + } while(0) + +#define HVX_OP_ADD_SCALAR(v) \ + ({ \ + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v); \ + HVX_Vector out = HVX_OP_ADD(v, val_vec); \ + Q6_V_vmux_QVV(pred_inf, inf, out); \ + }) + +#define HVX_OP_MUL_SCALAR(v) HVX_OP_MUL(v, val_vec) +#define HVX_OP_SUB_SCALAR(v) HVX_OP_SUB(v, val_vec) + +// Add Scalar Variants + +static inline void hvx_add_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + const HVX_Vector inf = hvx_vec_splat_f32(INFINITY); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_ADD_SCALAR); +} + +static inline void hvx_add_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + const HVX_Vector inf = hvx_vec_splat_f32(INFINITY); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_ADD_SCALAR); +} + +static inline void hvx_add_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + const HVX_Vector inf = hvx_vec_splat_f32(INFINITY); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_ADD_SCALAR); +} + +static inline void hvx_add_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + static const float kInf = 
INFINITY; + const HVX_Vector inf = hvx_vec_splat_f32(kInf); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_ADD_SCALAR); +} + +// Sub Scalar Variants + +static inline void hvx_sub_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_SUB_SCALAR); +} + +static inline void hvx_sub_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_SUB_SCALAR); +} + +static inline void hvx_sub_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_SUB_SCALAR); +} + +static inline void hvx_sub_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_SUB_SCALAR); +} + +// Mul Scalar Variants + +static inline void hvx_mul_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_mul_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_mul_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_mul_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_add_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_add_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_add_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_add_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_add_scalar_f32_uu(dst, src, val, num_elems); + } +} + +static inline void hvx_mul_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_mul_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + 
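        // only dst is aligned here: the _au variant uses unaligned loads with aligned stores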
hvx_mul_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_mul_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_mul_scalar_f32_uu(dst, src, val, num_elems); + } +} + +static inline void hvx_sub_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_sub_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_sub_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_sub_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_sub_scalar_f32_uu(dst, src, val, num_elems); + } +} + +// MIN Scalar variants + +#define HVX_OP_MIN_SCALAR(v) Q6_Vsf_vmin_VsfVsf(val_vec, v) + +static inline void hvx_min_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_min_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_min_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_min_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_min_scalar_f32_uu(dst, src, val, num_elems); + } +} + +// CLAMP Scalar variants + +#define HVX_OP_CLAMP_SCALAR(v) \ + ({ \ + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(v, max_vec); \ + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(min_vec, v); \ + HVX_Vector tmp = Q6_V_vmux_QVV(pred_cap_right, max_vec, v); \ + Q6_V_vmux_QVV(pred_cap_left, min_vec, tmp); \ + }) + +static inline void hvx_clamp_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector 
min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_clamp_scalar_f32_aa(dst, src, min, max, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_clamp_scalar_f32_au(dst, src, min, max, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_clamp_scalar_f32_ua(dst, src, min, max, num_elems); + } else { + hvx_clamp_scalar_f32_uu(dst, src, min, max, num_elems); + } +} + +#undef HVX_OP_ADD +#undef HVX_OP_SUB +#undef HVX_OP_MUL +#undef hvx_arith_loop_body +#undef HVX_OP_ADD_SCALAR +#undef HVX_OP_SUB_SCALAR +#undef HVX_OP_MUL_SCALAR +#undef hvx_scalar_loop_body +#undef HVX_OP_MIN_SCALAR +#undef HVX_OP_CLAMP_SCALAR + +#endif // HVX_ARITH_H diff --git a/src/ggml-hexagon/htp/hvx-base.h b/src/ggml-hexagon/htp/hvx-base.h new file mode 100644 index 00000000..ffa6e18e --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-base.h @@ -0,0 +1,167 @@ +#ifndef HVX_BASE_H +#define HVX_BASE_H + +#include +#include + +#include "hex-utils.h" +#include "hvx-types.h" + +static inline void hvx_vec_store_u(void * restrict dst, uint32_t n, HVX_Vector v) { + // Rotate as needed. 
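    // The idea: rotate the data so every byte lands at its target offset within
    // the (at most two) 128-byte aligned vectors covering dst..dst+n, then merge
    // only those n bytes using byte predicates. For example (illustrative values):
    // a dst offset of 40 within its aligned vector and n = 100 gives right_off = 140,
    // so the 12 spill bytes are stored into the next aligned vector first and the
    // final masked store then writes bytes 40..127 of the first one.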
+ v = Q6_V_vlalign_VVR(v, v, (size_t) dst); + + uint32_t left_off = (size_t) dst & 127; + uint32_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) dst); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + if (right_off > 128) { + Q6_vmem_QRIV(qr, (HVX_Vector *) dst + 1, v); + // all 1's + qr = Q6_Q_vcmp_eq_VbVb(v, v); + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector *) dst, v); +} + +static inline void hvx_vec_store_a(void * restrict dst, uint32_t n, HVX_Vector v) { + assert((unsigned long) dst % 128 == 0); + HVX_VectorPred m = Q6_Q_or_QQn(Q6_Q_vsetq_R((unsigned long) dst), Q6_Q_vsetq2_R(n)); + Q6_vmem_QnRIV(m, (HVX_Vector *) dst, v); +} + +static inline HVX_Vector hvx_vec_splat_f32(float v) { + union { float f; uint32_t i; } u = { .f = v }; + return Q6_V_vsplat_R(u.i); +} + +static inline HVX_Vector hvx_vec_splat_f16(float v) { + union { __fp16 f; uint16_t i; } u = { .f = v }; + return Q6_Vh_vsplat_R(u.i); +} + +static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) { + // vdelta control to replicate first 4 bytes across all elements + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + + HVX_Vector ctrl = *(HVX_Vector *) repl; + return Q6_V_vdelta_VV(v, ctrl); +} + +static inline float hvx_vec_get_f32(HVX_Vector v) { + float __attribute__((aligned(128))) x; + hvx_vec_store_a(&x, 4, v); + return x; +} + +static inline HVX_Vector hvx_vec_abs_f16(HVX_Vector v) { + // abs by clearing the fp16 sign bit + HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff); + return Q6_V_vand_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_neg_f16(HVX_Vector v) { + // neg by setting the fp16 sign bit + HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); + return Q6_V_vxor_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_abs_f32(HVX_Vector v) { + // abs by clearing the fp32 sign bit + HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff); + return Q6_V_vand_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_neg_f32(HVX_Vector v) { +#if __HVX_ARCH__ > 75 + return Q6_Vsf_vfneg_Vsf(v); +#else + // neg by setting the fp32 sign bit + HVX_Vector mask = Q6_V_vsplat_R(0x80000000); + return Q6_V_vxor_VV(v, mask); +#endif // __HVX_ARCH__ > 75 +} + +static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) { + const HVX_Vector vnan_exp = Q6_Vh_vsplat_R(0x7C00); + const HVX_Vector vnan_frac = Q6_Vh_vsplat_R(0x7FFF); + + // get pred of which are NaN, i.e., exponent bits all 1s and fraction bits non 0s + HVX_VectorPred p_exp = Q6_Q_vcmp_eq_VhVh(Q6_V_vand_VV(v, vnan_exp), vnan_exp); + HVX_VectorPred p_frac = Q6_Q_not_Q(Q6_Q_vcmp_eq_VhVh(Q6_V_vand_VV(v, vnan_frac), vnan_exp)); + return Q6_Q_and_QQ(p_exp, p_frac); +} + +static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) { + const 
HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero); + HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero); + HVX_Vector v = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0))); + +#if __HVX_ARCH__ < 79 + // replace NaNs with -INF, older arches produce NaNs for (-INF + 0.0) + const HVX_Vector neg_inf = hvx_vec_splat_f16(-INFINITY); + HVX_VectorPred nan = hvx_vec_is_nan_f16(v); + v = Q6_V_vmux_QVV(nan, neg_inf, v); +#endif + + return v; +} + +/* Q6_Vsf_equals_Vw is only available on v73+.*/ +#if __HVX_ARCH__ < 73 +static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in) +{ + HVX_Vector const vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); + return ret; +} + +static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) +{ + return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in)); +} +#endif + +static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { + // This looks complicated. + // Ideally should just be Q6_Vh_equals_Vhf(vin) + // but that instruction does not do proper rounding. + + // convert to qf32, multiplying by 1.0 in the process. + HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00)); + + // 'in-range' values are +/32752. + // add 192K to it, convert to sf + HVX_Vector v192K = Q6_V_vsplat_R(0x48400000); + HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K)); + HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K)); + + // for in-range cases, result is {163858... 229360} so the exponent is always 144. + // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer. + // Start by <<10 to get the final 'sign' bit in bit 15... 
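    // After the shift, Q6_Vh_vround_VwVw_sat keeps the upper halfword of each
    // 32-bit lane (original bits 21..6), rounding to nearest on the 6 bits that
    // are dropped, and packs both word vectors back into one halfword vector.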
+ vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10); + vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10); + + // now round down to 16 + return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0); +} + +#endif /* HVX_BASE_H */ diff --git a/src/ggml-hexagon/htp/hvx-copy.h b/src/ggml-hexagon/htp/hvx-copy.h new file mode 100644 index 00000000..6b617b76 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-copy.h @@ -0,0 +1,247 @@ +#ifndef HVX_COPY_H +#define HVX_COPY_H + +#include +#include +#include + +#include "hvx-base.h" + +#define hvx_splat_loop_body(dst_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + \ + uint32_t nvec = n / (128 / elem_size); \ + uint32_t nloe = n % (128 / elem_size); \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = src; \ + } \ + if (nloe) { \ + vec_store((void *) &vdst[i], nloe * elem_size, src); \ + } \ + } while(0) + +static inline void hvx_splat_a(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) dst % 128 == 0); + hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_splat_u(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) { + hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_splat_f32_a(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_a(dst, hvx_vec_splat_f32(v), n, sizeof(float)); +} + +static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_u(dst, hvx_vec_splat_f32(v), n, sizeof(float)); +} + +static inline void hvx_splat_f16_a(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16)); +} + +static inline void hvx_splat_f16_u(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16)); +} + +#define hvx_copy_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { vdst[i] = vsrc[i]; } \ + if (nloe) { \ + vec_store((void *) &vdst[i], nloe * elem_size, vsrc[i]); \ + } \ + } while(0) + +// Generic copy routines +static inline void hvx_copy_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_copy_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_copy_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) dst % 128 == 0); + hvx_copy_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_copy_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) src % 128 == 0); + hvx_copy_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_copy_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + hvx_copy_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +// copy n fp16 elements : source and destination are aligned to HVX Vector (128) +static inline void hvx_copy_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_aa(dst, src, n, sizeof(__fp16)); +} + +// copy n fp16 elements : source is aligned, 
destination is potentially unaligned +static inline void hvx_copy_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_au(dst, src, n, sizeof(__fp16)); +} + +// copy n fp16 elements : source is aligned, destination is potentially unaligned +static inline void hvx_copy_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_ua(dst, src, n, sizeof(__fp16)); +} + +// copy n fp16 elements : source is aligned, destination is potentially unaligned +static inline void hvx_copy_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_uu(dst, src, n, sizeof(__fp16)); +} + +// copy n fp32 elements : source and destination are aligned to HVX Vector (128) +static inline void hvx_copy_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_aa(dst, src, n, sizeof(float)); +} + +// copy n fp32 elements : source is aligned, destination is unaligned +static inline void hvx_copy_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_ua(dst, src, n, sizeof(float)); +} + +// copy n fp32 elements : source is unaligned, destination is aligned +static inline void hvx_copy_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_au(dst, src, n, sizeof(float)); +} + +// copy n fp32 elements : source is unaligned, destination unaligned +static inline void hvx_copy_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_uu(dst, src, n, sizeof(float)); +} + +//// fp32 -> fp16 + +#define hvx_copy_f16_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector zero = Q6_V_vsplat_R(0); \ + \ + const uint32_t elem_size = sizeof(__fp16); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]); \ + } \ + if (nloe) { \ + HVX_Vector v = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]); \ + vec_store((void *) &vdst[i], nloe * elem_size, v); \ + } \ + } while(0) + +// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is aligned +static inline void hvx_copy_f16_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned +static inline void hvx_copy_f16_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned +static inline void hvx_copy_f16_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned +static inline void hvx_copy_f16_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_f16_f32_loop_body(HVX_UVector, 
HVX_UVector, hvx_vec_store_u); +} + +//// fp16 -> fp32 + +#define hvx_copy_f32_f16_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector one = hvx_vec_splat_f16(1.0); \ + \ + const uint32_t elem_size = sizeof(__fp16); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (i = 0; i < nvec; ++i) { \ + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \ + vdst[i*2] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p)); \ + vdst[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)); \ + } \ + \ + if (nloe) { \ + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \ + \ + HVX_Vector vd = Q6_V_lo_W(p); \ + i = 2 * i; \ + \ + if (nloe >= 32) { \ + vdst[i] = Q6_Vsf_equals_Vqf32(vd); \ + nloe -= 32; ++i; vd = Q6_V_hi_W(p); \ + } \ + \ + if (nloe) { \ + vd = Q6_Vsf_equals_Vqf32(vd); \ + hvx_vec_store_u(&vdst[i], nloe * sizeof(float), vd); \ + } \ + } \ + } while(0) + +// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is aligned +static inline void hvx_copy_f32_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is aligned +static inline void hvx_copy_f32_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is unaligned +static inline void hvx_copy_f32_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is unaligned +static inline void hvx_copy_f32_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +#endif // HVX_COPY_H diff --git a/src/ggml-hexagon/htp/hvx-dump.h b/src/ggml-hexagon/htp/hvx-dump.h new file mode 100644 index 00000000..e8822278 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-dump.h @@ -0,0 +1,132 @@ +#ifndef HVX_DUMP_H +#define HVX_DUMP_H + +#include + +#include +#include + +#include "hex-utils.h" +#include "hvx-types.h" + +static void hvx_vec_dump_f16_n(char * pref, HVX_Vector v, uint32_t n) { + HVX_VectorAlias u = { .v = v }; + + const uint32_t n0 = n / 16; + const uint32_t n1 = n % 16; + int i = 0; + for (; i < n0; i++) { + hex_dump_f16_line(pref, u.fp16 + (16 * i), 16); + } + if (n1) { + hex_dump_f16_line(pref, u.fp16 + (16 * i), n1); + } +} + +static void hvx_vec_dump_f16(char * pref, HVX_Vector v) { + hvx_vec_dump_f16_n(pref, v, 64); +} + +static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) { + union { + HVX_Vector v; + float d[32]; + } u = { .v = v }; + + const uint32_t n0 = n / 16; + const uint32_t n1 = n % 16; + int i = 0; + for (; i < n0; i++) { + hex_dump_f32_line(pref, u.d + (16 * i), 16); + } + if (n1) { + hex_dump_f32_line(pref, u.d + (16 * i), n1); + } +} + 
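// The _hmt variants below (presumably "head, middle, tail") log only a sample of
// the lanes (elements 0..3, 12..15 and 28..31 for f32) so a single FARF line is
// enough to eyeball a vector, e.g. hvx_vec_dump_f32_hmt("acc", acc) for some
// accumulator vector acc.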
+static void hvx_vec_dump_f32_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + float d[32]; + } u = { .v = v }; + + FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1], + u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); +} + +static void hvx_vec_dump_f32(char * pref, HVX_Vector v) { + hvx_vec_dump_f32_n(pref, v, 32); +} + +static void hvx_vec_dump_int32(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int32_t d[32]; + } u = { .v = v }; + + for (int i = 0; i < 32 / 16; i++) { + hex_dump_int32_line(pref, u.d + (16 * i), 16); + } +} + +static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int32_t d[32]; + } u = { .v = v }; + + FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12], + u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); +} + +static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int8_t d[128]; + } u = { .v = v }; + + FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60], + u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]); +} + +static void hvx_vec_dump_int8(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int8_t d[128]; + } u = { .v = v }; + + for (int i = 0; i < 128 / 16; i++) { + hex_dump_int8_line(pref, u.d + (16 * i), 16); + } +} + +static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + uint8_t d[128]; + } u = { .v = v }; + + for (int i = 0; i < 128 / 16; i++) { + hex_dump_uint8_line(pref, u.d + (16 * i), 16); + } +} + +static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) { + typedef union { + HVX_Vector v; + int8_t d[128]; + } U; + + U u0 = { .v = v0 }; + U u1 = { .v = v1 }; + + for (int i = 0; i < n; i++) { + if (u0.d[i] != u1.d[i]) { + return false; + } + } + + return true; +} + +#endif /* HVX_DUMP_H */ diff --git a/src/ggml-hexagon/htp/hvx-exp.c b/src/ggml-hexagon/htp/hvx-exp.c deleted file mode 100644 index 21bf46a5..00000000 --- a/src/ggml-hexagon/htp/hvx-exp.c +++ /dev/null @@ -1,94 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "htp-ctx.h" -#include "htp-dma.h" -#include "htp-msg.h" -#include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" - -static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) { - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); - - HVX_Vector out = hvx_vec_exp_fp32(in_vec); - - return Q6_V_vmux_QVV(pred0, inf, out); -} - -void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - // assert((0 == unaligned_addr) || (0 == num_elems_whole)); - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_exp_f32: unaligned loop 
in hvx op, possibly slower execution\n"); - } - - HVX_Vector vec_out = Q6_V_vzero(); - - static const float kInf = INFINITY; - static const float kMaxExp = 88.02f; // log(INF) - - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector inf = hvx_vec_splat_fp32(kInf); - - if (0 == unaligned_loop) { - HVX_Vector * p_vec_in1 = (HVX_Vector *) src; - HVX_Vector * p_vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - if (true == negate) { - HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); - } else { - *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf); - } - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - if (true == negate) { - HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); - } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf); - } - } - } - - if (left_over > 0) { - const float * srcf = (float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - if (true == negate) { - HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - - vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); - } else { - vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf); - } - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); - } -} diff --git a/src/ggml-hexagon/htp/hvx-exp.h b/src/ggml-hexagon/htp/hvx-exp.h new file mode 100644 index 00000000..44dfe232 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-exp.h @@ -0,0 +1,215 @@ +#ifndef HVX_EXP_H +#define HVX_EXP_H + +#include +#include + +#include "hvx-base.h" +#include "hvx-floor.h" + +#define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!) +#define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!) +#define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!) +#define EXP_COEFF_2 (0x3D2AA9C1) // 0.416658 = 1/(4!) +#define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!) +#define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!) +#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805 +#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408 +#define EXP_ONE (0x3f800000) // 1.0 +#define EXP_RANGE_R (0x41a00000) // 20.0 +#define EXP_RANGE_L (0xc1a00000) // -20.0 + +static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) { + HVX_Vector z_qf32_v; + HVX_Vector x_v; + HVX_Vector x_qf32_v; + HVX_Vector y_v; + HVX_Vector k_v; + HVX_Vector f_v; + HVX_Vector epsilon_v; + HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E); + HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2); + HVX_Vector E_const; + HVX_Vector zero_v = Q6_V_vzero(); + + // exp(x) is approximated as follows: + // f = floor(x/ln(2)) = floor(x*log2(e)) + // epsilon = x - f*ln(2) + // exp(x) = exp(epsilon+f*ln(2)) + // = exp(epsilon)*exp(f*ln(2)) + // = exp(epsilon)*2^f + // + // Since epsilon is close to zero, it can be approximated with its Taylor series: + // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+... 
+ // Preserving the first eight elements, we get: + // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7 + // = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2 + + HVX_Vector temp_v = in_vec; + + // Clamp inputs to (-20.0, 20.0) + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R)); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec); + + in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v); + in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v); + + epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec); + epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v); + + // f_v is the floating point result and k_v is the integer result + f_v = hvx_vec_floor_f32(epsilon_v); + k_v = hvx_vec_truncate_f32(f_v); + + x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v); + + // x = x - f_v * logn2; + epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2); + x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v); + // normalize before every QFloat's vmpy + x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v); + + // z = x * x; + z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v); + z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v); + + x_v = Q6_Vsf_equals_Vqf32(x_qf32_v); + + // y = E4 + E5 * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_5); + y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v); + E_const = Q6_V_vsplat_R(EXP_COEFF_4); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E3 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_3); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E2 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_2); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E1 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_1); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E0 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_0); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = x + y * z; + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = y + 1.0; + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE)); + + // insert exponents + // y = ldexpf(y, k); + // y_v += k_v; // qf32 + // modify exponent + + y_v = Q6_Vsf_equals_Vqf32(y_v); + + // add k_v to the exponent of y_v + HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1); + + y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1); + y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent); + + // exponent cannot be negative; if overflow is detected, result is set to zero + HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent); + + y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN); + + y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v); + + return y_v; +} + +static inline HVX_Vector hvx_vec_exp_f32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) { + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); + + HVX_Vector out = hvx_vec_exp_f32(in_vec); + + return Q6_V_vmux_QVV(pred0, inf, out); +} + +static inline void hvx_exp_f32(const uint8_t * restrict src, 
uint8_t * restrict dst, const int num_elems, bool negate) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == hex_is_aligned((void *) src, VLEN)) || (0 == hex_is_aligned((void *) dst, VLEN))) { + unaligned_addr = 1; + } + // assert((0 == unaligned_addr) || (0 == num_elems_whole)); + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + } + + HVX_Vector vec_out = Q6_V_vzero(); + + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_f32(kInf); + + if (0 == unaligned_loop) { + HVX_Vector * p_vec_in1 = (HVX_Vector *) src; + HVX_Vector * p_vec_out = (HVX_Vector *) dst; + + #pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_f32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf); + } else { + *p_vec_out++ = hvx_vec_exp_f32_guard(*p_vec_in1++, max_exp, inf); + } + } + } else { + #pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_f32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf); + } else { + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(in, max_exp, inf); + } + } + } + + if (left_over > 0) { + const float * srcf = (float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_f32(in); + + vec_out = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf); + } else { + vec_out = hvx_vec_exp_f32_guard(in, max_exp, inf); + } + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); + } +} + +#endif /* HVX_EXP_H */ diff --git a/src/ggml-hexagon/htp/hvx-floor.h b/src/ggml-hexagon/htp/hvx-floor.h new file mode 100644 index 00000000..6a1bfde5 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-floor.h @@ -0,0 +1,100 @@ +#ifndef HVX_FLOOR_H +#define HVX_FLOOR_H + +#include +#include + +#include "hvx-base.h" + +#define IEEE_VSF_EXPLEN (8) +#define IEEE_VSF_EXPBIAS (127) +#define IEEE_VSF_EXPMASK (0xFF) +#define IEEE_VSF_MANTLEN (23) +#define IEEE_VSF_MANTMASK (0x7FFFFF) +#define IEEE_VSF_MIMPMASK (0x800000) + +static inline HVX_Vector hvx_vec_truncate_f32(HVX_Vector in_vec) { + HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); + HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); + HVX_Vector const_zero_v = Q6_V_vzero(); + + HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); + + HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; + expval_v &= IEEE_VSF_EXPMASK; + expval_v -= IEEE_VSF_EXPBIAS; + + // negative exp == fractional value + HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); + + HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift + + HVX_Vector mant_v = in_vec & mask_mant_v; // obtain mantissa + HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0 + + vout = Q6_Vw_vasr_VwVw(vout, rshift_v); // shift to obtain truncated integer + vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0 + + HVX_Vector neg_vout = -vout; + + vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // 
handle negatives + + return (vout); +} + +static inline HVX_Vector hvx_vec_floor_f32(HVX_Vector in_vec) { + HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); + HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); + HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN); + HVX_Vector const_zero_v = Q6_V_vzero(); + HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf + + HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); + + HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; + expval_v &= IEEE_VSF_EXPMASK; + expval_v -= IEEE_VSF_EXPBIAS; + + HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); + HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v); + HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v); + HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec); + + // if expval < 0 (q_negexp) // <0, floor is 0 + // if vin > 0 + // floor = 0 + // if vin < 0 + // floor = -1 + // if expval < mant_len (q_expltmn) // >0, but fraction may exist + // get sign (q_negative) + // mask >> expval // fraction bits to mask off + // vout = ~(mask) // apply mask to remove fraction + // if (qneg) // negative floor is one less (more, sign bit for neg) + // vout += ((impl_mask) >> expval) + // if (mask && vin) + // vout = vin + // else // already an integer + // ; // no change + + // compute floor + mask_mant_v >>= expval_v; + HVX_Vector neg_addin_v = mask_impl_v >> expval_v; + HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v); + HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec); + + HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set + HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v); + + HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v); // frac bits to clear + HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits + + vout = in_vec; + vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout); // expval0 -> 0 + vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1 + + return vout; +} + +#endif /* HVX_FLOOR_H */ diff --git a/src/ggml-hexagon/htp/hvx-inverse.c b/src/ggml-hexagon/htp/hvx-inverse.c deleted file mode 100644 index 4d70634f..00000000 --- a/src/ggml-hexagon/htp/hvx-inverse.c +++ /dev/null @@ -1,72 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "htp-ctx.h" -#include "htp-dma.h" -#include "htp-msg.h" -#include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" - -static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) { - HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - - HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); - const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); - - return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); -} - -void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly 
slower execution\n"); - unaligned_addr = 1; - } - // assert((0 == unaligned_addr) || (0 == num_elems_whole)); - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - static const uint32_t kNanInfMask = 0x7f800000; - const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask); - - if (0 == unaligned_loop) { - HVX_Vector * p_vec_in = (HVX_Vector *) src; - HVX_Vector * p_vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask); - } - } - - if (left_over > 0) { - const float * srcf = (float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); - } -} diff --git a/src/ggml-hexagon/htp/hvx-inverse.h b/src/ggml-hexagon/htp/hvx-inverse.h new file mode 100644 index 00000000..49f3efab --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-inverse.h @@ -0,0 +1,176 @@ +#ifndef HVX_INVERSE_H +#define HVX_INVERSE_H + +#include + +#include +#include +#include +#include +#include + +#include "hvx-base.h" + +// ==================================================== +// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5 +// Order:3; continuity: True; Ends forced: True +// Mode: unsigned; Result fractional bits: 14 +// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05 +// 32769 -32706 31252 -10589 +// 32590 -30635 22793 -4493 +// 32066 -27505 16481 -2348 +// 31205 -24054 11849 -1306 + +static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) { + // input is 0..0xffff representing 0.0 .. 1.0 + HVX_Vector p; + p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull); + p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); + p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); + p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); + return p; // signed result, 14 fractional bits +} + +// Find reciprocal of fp16. +// (1) first, convert to fp32, multiplying by 1.0; this is done to +// handle denormals. Ignoring sign and zero, result should be at +// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000) +// (exponent in range [103,143]) +// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly +// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32 +// (4) convert that to fp16 +// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace +// the result with the max value. +static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) { + HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF); + HVX_Vector avals = Q6_V_vand_VV(vals, em_mask); + HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals); + // is too small to 1/x ? 
for 'standard' fp16, this would be 0x101 + HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals); + + HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0 + HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32)); + HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32)); + + // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector + HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9)); + // likewise extract the upper 16 from each, containing the exponents in range 103..142 + HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0); + //Get exponent in IEEE 32-bit representation + exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7); + + // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane + // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0) + // Use poly to transform to 1/x, with 14 fractional bits + // + HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16); + + HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros + + // Get mantissa for 16-bit represenation + HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF)); + + //Compute Reciprocal Exponent + HVX_Vector exp_recip = + Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1))); + //Convert it for 16-bit representation + exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15)); + exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10); + + //Merge exponent and mantissa for reciprocal + HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip); + // map 'small' inputs to standard largest value 0x7bff + recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip); + // add sign back + recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000); + return recip; +} + +static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) { + HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3); + HVX_Vector two_sf = hvx_vec_splat_f32(2.0); + + // First approximation + HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf); + + HVX_Vector r_qf; + + // Refine + r_qf = Q6_Vqf32_vmpy_VsfVsf( + i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf))))); + r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( + r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); + r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( + r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); + + return Q6_Vsf_equals_Vqf32(r_qf); +} + +static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) { + HVX_Vector out = hvx_vec_inverse_f32(v_sf); + + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); + + return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); +} + +#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \ + \ + const uint32_t nvec = n / VLEN_FP32; \ + const uint32_t nloe = n % VLEN_FP32; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], 
nan_inf_mask); \ + } \ + if (nloe) { \ + HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask); \ + vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v); \ + } \ + } while(0) + +static inline void hvx_inverse_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_inverse_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_inverse_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_inverse_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_inverse_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_inverse_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_inverse_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_inverse_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_inverse_f32(uint8_t * restrict dst, uint8_t * restrict src, const int num_elems) { + if ((unsigned long) dst % 128 == 0) { + if ((unsigned long) src % 128 == 0) { + hvx_inverse_f32_aa(dst, src, num_elems); + } else { + hvx_inverse_f32_au(dst, src, num_elems); + } + } else { + if ((unsigned long) src % 128 == 0) { + hvx_inverse_f32_ua(dst, src, num_elems); + } else { + hvx_inverse_f32_uu(dst, src, num_elems); + } + } +} + +#endif // HVX_INVERSE_H diff --git a/src/ggml-hexagon/htp/hvx-reduce.h b/src/ggml-hexagon/htp/hvx-reduce.h new file mode 100644 index 00000000..8845fe73 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-reduce.h @@ -0,0 +1,225 @@ +#ifndef HVX_REDUCE_H +#define HVX_REDUCE_H + +#include +#include +#include +#include + +#include "hex-utils.h" +#include "hvx-base.h" +#include "hvx-types.h" + +static inline HVX_Vector hvx_vec_reduce_sum_n_i32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // int32 + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_reduce_sum_i32(HVX_Vector in) { + return hvx_vec_reduce_sum_n_i32(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_sum_n_qf32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right + sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) { + return hvx_vec_reduce_sum_n_qf32(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) { + return hvx_vec_reduce_sum_n_f32(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_max_f16(HVX_Vector in) { + unsigned 
total = 128; // total vec nbytes + unsigned width = 2; // fp16 nbytes + + HVX_Vector _max = in, _max_t; + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max2_f16(HVX_Vector in, HVX_Vector _max) { + unsigned total = 128; // total vec nbytes + unsigned width = 2; // fp32 nbytes + + HVX_Vector _max_t; + + _max = Q6_Vhf_vmax_VhfVhf(in, _max); + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max_f32(HVX_Vector in) { + unsigned total = 128; // total vec nbytes + unsigned width = 4; // fp32 nbytes + + HVX_Vector _max = in, _max_t; + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max2_f32(HVX_Vector in, HVX_Vector _max) { + unsigned total = 128; // total vec nbytes + unsigned width = 4; // fp32 nbytes + + HVX_Vector _max_t; + + _max = Q6_Vsf_vmax_VsfVsf(in, _max); + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +#define hvx_reduce_loop_body(src_type, init_vec, pad_vec, vec_op, reduce_op, scalar_reduce) \ + do { \ + src_type * restrict vsrc = (src_type *) src; \ + HVX_Vector acc = init_vec; \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = num_elems / epv; \ + const uint32_t nloe = num_elems % epv; \ + \ + uint32_t i = 0; \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + acc = vec_op(acc, vsrc[i]); \ + } \ + if (nloe) { \ + const float * srcf = (const float *) src + i * epv; \ + HVX_Vector in = *(HVX_UVector *) srcf; \ + HVX_Vector temp = Q6_V_valign_VVR(in, pad_vec, nloe * elem_size); \ + acc = vec_op(acc, temp); \ + } \ + HVX_Vector v = reduce_op(acc); \ + return scalar_reduce(v); \ + } while(0) + +#define HVX_REDUCE_MAX_OP(acc, val) Q6_Vsf_vmax_VsfVsf(acc, val) +#define HVX_REDUCE_SUM_OP(acc, val) Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(acc), val) +#define HVX_SUM_SQ_OP(acc, val) Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(val, val)) +#define HVX_REDUCE_MAX_SCALAR(v) hvx_vec_get_f32(v) +#define HVX_REDUCE_SUM_SCALAR(v) hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(v)) + +// Max variants + +static inline float hvx_reduce_max_f32_a(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = hvx_vec_splat_f32(((const float *) src)[0]); + assert((unsigned long) src % 128 == 0); + hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_REDUCE_MAX_OP, hvx_vec_reduce_max_f32, HVX_REDUCE_MAX_SCALAR); +} + +static inline float hvx_reduce_max_f32_u(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = hvx_vec_splat_f32(((const float *) src)[0]); + hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_REDUCE_MAX_OP, hvx_vec_reduce_max_f32, HVX_REDUCE_MAX_SCALAR); +} + +static inline float hvx_reduce_max_f32(const uint8_t * restrict src, const int num_elems) { + if (hex_is_aligned((void *) src, 128)) { + return hvx_reduce_max_f32_a(src, num_elems); + } else { + return hvx_reduce_max_f32_u(src, num_elems); + } +} + 
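// A minimal scalar model (illustration only; `example_rotate_reduce_max_f32` is a
// hypothetical name, not part of this header): the reducers above fold a vector with
// a rotate-and-combine loop that doubles the folded span every step, so a 32 x f32
// vector is reduced in log2(32) = 5 combines and every lane ends up holding the
// result. Q6_V_vror_VR plays the role of the modular rotation.
static inline float example_rotate_reduce_max_f32(const float in[32]) {
    float cur[32];
    float rot[32];
    for (int i = 0; i < 32; i++) {
        cur[i] = in[i];
    }
    for (int width = 1; width < 32; width <<= 1) {
        for (int i = 0; i < 32; i++) {
            rot[i] = cur[(i + width) % 32];              // rotate by `width` lanes
        }
        for (int i = 0; i < 32; i++) {
            cur[i] = cur[i] > rot[i] ? cur[i] : rot[i];  // elementwise max
        }
    }
    return cur[0];  // any lane works; hvx_vec_get_f32() similarly extracts a single lane
}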
+// Sum variants + +static inline float hvx_reduce_sum_f32_a(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + assert((unsigned long) src % 128 == 0); + hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_REDUCE_SUM_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_reduce_sum_f32_u(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_REDUCE_SUM_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_reduce_sum_f32(const uint8_t * restrict src, const int num_elems) { + if (hex_is_aligned((void *) src, 128)) { + return hvx_reduce_sum_f32_a(src, num_elems); + } else { + return hvx_reduce_sum_f32_u(src, num_elems); + } +} + +// Sum of squares variants + +static inline float hvx_sum_of_squares_f32_a(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + assert((uintptr_t) src % 128 == 0); + hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_SUM_SQ_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_sum_of_squares_f32_u(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_SUM_SQ_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) { + if (hex_is_aligned((void *) src, 128)) { + return hvx_sum_of_squares_f32_a(src, num_elems); + } else { + return hvx_sum_of_squares_f32_u(src, num_elems); + } +} + +#undef hvx_reduce_loop_body +#undef HVX_REDUCE_MAX_OP +#undef HVX_REDUCE_SUM_OP +#undef HVX_REDUCE_MAX_SCALAR +#undef HVX_REDUCE_SUM_SCALAR +#undef HVX_SUM_SQ_OP + +#endif /* HVX_REDUCE_H */ diff --git a/src/ggml-hexagon/htp/hvx-scale.h b/src/ggml-hexagon/htp/hvx-scale.h new file mode 100644 index 00000000..c65c9863 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-scale.h @@ -0,0 +1,133 @@ +#ifndef HVX_SCALE_H +#define HVX_SCALE_H + +#include +#include +#include + +#include "hvx-base.h" + +#define hvx_scale_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + HVX_Vector vs = hvx_vec_splat_f32(scale); \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; ++i) { \ + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); \ + vdst[i] = Q6_Vsf_equals_Vqf32(v); \ + } \ + if (nloe) { \ + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); \ + vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v)); \ + } \ + } while(0) + +static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + assert((size_t) dst % 128 == 0); + assert((size_t) src % 128 == 0); + hvx_scale_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_scale_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + assert((size_t) dst % 128 == 0); + hvx_scale_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_scale_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + 
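// 'ua' variant: destination may be unaligned, source is expected to be 128-byte aligned (checked below).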
assert((size_t) src % 128 == 0); + hvx_scale_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + hvx_scale_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + if (((size_t) dst & 127) == 0) { + if (((size_t) src & 127) == 0) { + hvx_scale_f32_aa(dst, src, n, scale); + } else { + hvx_scale_f32_au(dst, src, n, scale); + } + } else { + if (((size_t) src & 127) == 0) { + hvx_scale_f32_ua(dst, src, n, scale); + } else { + hvx_scale_f32_uu(dst, src, n, scale); + } + } +} + +#define hvx_scale_offset_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + HVX_Vector vs = hvx_vec_splat_f32(scale); \ + HVX_Vector vo = hvx_vec_splat_f32(offset); \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; ++i) { \ + HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \ + vdst[i] = Q6_Vsf_equals_Vqf32(v); \ + } \ + if (nloe) { \ + HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \ + vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v)); \ + } \ + } while(0) + +static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + assert((size_t) dst % 128 == 0); + assert((size_t) src % 128 == 0); + hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_scale_offset_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + assert((size_t) dst % 128 == 0); + hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_scale_offset_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + assert((size_t) src % 128 == 0); + hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + if (((size_t) dst & 127) == 0) { + if (((size_t) src & 127) == 0) { + hvx_scale_offset_f32_aa(dst, src, n, scale, offset); + } else { + hvx_scale_offset_f32_au(dst, src, n, scale, offset); + } + } else { + if (((size_t) src & 127) == 0) { + hvx_scale_offset_f32_ua(dst, src, n, scale, offset); + } else { + hvx_scale_offset_f32_uu(dst, src, n, scale, offset); + } + } +} + +#endif // HVX_SCALE_H diff --git a/src/ggml-hexagon/htp/hvx-sigmoid.c b/src/ggml-hexagon/htp/hvx-sigmoid.c deleted file mode 100644 index 15ac6469..00000000 --- a/src/ggml-hexagon/htp/hvx-sigmoid.c +++ /dev/null @@ -1,49 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored 
"-Wunused-but-set-variable" - -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "htp-ctx.h" -#include "htp-dma.h" -#include "htp-msg.h" -#include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" - -#if 0 -// Reference algo used in hvx-utils -static void fast_sigmoid_f32(const float* restrict src, float* restrict dst, const int num_elems) -{ - const float c1 = 0.03138777; - const float c2 = 0.276281267; - const float c_log2f = 1.442695022; - - int32_t store_ints[32]; - float store_floats[3][32]; - - for (int i = 0; i < num_elems; i++) - { - float v = src0[i]; - - v *= c_log2f*0.5; - int intPart = (int)v; - float x = (v - intPart); - float xx = x * x; - float v1 = c_log2f + c2 * xx; - float v2 = x + xx * c1 * x; - float v3 = (v2 + v1); - *((int*)&v3) += intPart << 24; - float v4 = v2 - v1; - float v5 = v3 - v4; - float res = v3 / v5; - - dst[i] = res; - } -} -#endif diff --git a/src/ggml-hexagon/htp/hvx-sigmoid.h b/src/ggml-hexagon/htp/hvx-sigmoid.h new file mode 100644 index 00000000..1b4aaff0 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-sigmoid.h @@ -0,0 +1,114 @@ +#ifndef HVX_SIGMOID_H +#define HVX_SIGMOID_H + +#include "hvx-base.h" + +#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 +#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 +#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 +#define FAST_SIGMOID_C3 (0x3f000000) // 0.5 + +static inline HVX_Vector hvx_vec_fast_sigmoid_f32(HVX_Vector v) { + v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); + v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3)); + + HVX_Vector in_int = hvx_vec_truncate_f32(Q6_Vsf_equals_Vqf32(v)); + HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int)); + HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x); + + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2)); + v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); + + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1)); + v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx); + v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x); + + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1)); + HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1); + v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24); + v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent); + v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24); + + HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); + HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); + + HVX_Vector res = hvx_vec_inverse_f32(v5); + res = Q6_Vqf32_vmpy_VsfVsf(v3, res); + + return Q6_Vsf_equals_Vqf32(res); +} + +static inline HVX_Vector hvx_vec_fast_sigmoid_f32_guard(HVX_Vector v, + HVX_Vector one, + HVX_Vector max_exp, + HVX_Vector min_exp) { + const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); + const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); + + HVX_Vector out = hvx_vec_fast_sigmoid_f32(v); + out = Q6_V_vmux_QVV(pred_max, out, one); + return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero()); +} + +static inline HVX_Vector hvx_vec_tanh_f32(HVX_Vector x) { + // tanh(x) = 2 * sigmoid(2x) - 1 + HVX_Vector two = hvx_vec_splat_f32(2.0f); + HVX_Vector one = hvx_vec_splat_f32(1.0f); + HVX_Vector x2 = Q6_Vqf32_vmpy_VsfVsf(x, two); + + HVX_Vector max_exp = hvx_vec_splat_f32(87.f); + HVX_Vector min_exp = hvx_vec_splat_f32(-87.f); + + HVX_Vector sig2x = 
hvx_vec_fast_sigmoid_f32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp); + + HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two); + res = Q6_Vqf32_vsub_Vqf32Vsf(res, one); + return Q6_Vsf_equals_Vqf32(res); +} + +#define hvx_sigmoid_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector one = hvx_vec_splat_f32(1.f); \ + const HVX_Vector max_exp = hvx_vec_splat_f32(87.f); \ + const HVX_Vector min_exp = hvx_vec_splat_f32(-87.f); \ + \ + const uint32_t epv = 128 / sizeof(float); \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = hvx_vec_fast_sigmoid_f32_guard(vsrc[i], one, max_exp, min_exp); \ + } \ + if (nloe) { \ + HVX_Vector tmp = hvx_vec_fast_sigmoid_f32_guard(vsrc[i], one, max_exp, min_exp); \ + vec_store((void *) &vdst[i], nloe * sizeof(float), tmp); \ + } \ + } while(0) + +static inline void hvx_sigmoid_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_sigmoid_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_sigmoid_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_sigmoid_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_sigmoid_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_sigmoid_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_sigmoid_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_sigmoid_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +#endif /* HVX_SIGMOID_H */ diff --git a/src/ggml-hexagon/htp/hvx-sqrt.h b/src/ggml-hexagon/htp/hvx-sqrt.h new file mode 100644 index 00000000..28ee9f68 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-sqrt.h @@ -0,0 +1,60 @@ +#ifndef HVX_SQRT_H +#define HVX_SQRT_H + +#include +#include + +#include "hex-utils.h" + +#include "hvx-base.h" + +#define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation +#define RSQRT_ONE_HALF 0x3f000000 // 0.5 +#define RSQRT_THREE_HALVES 0x3fc00000 // 1.5 + +static inline HVX_Vector hvx_vec_rsqrt_f32(HVX_Vector in_vec) { + //Algorithm : + // x2 = input*0.5 + // y = * (long *) &input + // y = 0x5f3759df - (y>>2) + // y = y*(threehalfs - x2*y*y) + + HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST); + HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF); + HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES); + + HVX_Vector x2, y, ypower2, temp; + + x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf); + x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero()); + + y = Q6_Vw_vasr_VwR(in_vec, 1); + y = Q6_Vw_vsub_VwVw(rsqrtconst, y); + + // 1st iteration + ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp)); + + // 2nd iteration + y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); + ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, 
Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); + + // 3rd iteration + y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); + ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); + + return Q6_Vsf_equals_Vqf32(temp); +} + +#endif /* HVX_SQRT_H */ diff --git a/src/ggml-hexagon/htp/hvx-types.h b/src/ggml-hexagon/htp/hvx-types.h new file mode 100644 index 00000000..d495a59f --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-types.h @@ -0,0 +1,36 @@ +#ifndef HVX_TYPES_H +#define HVX_TYPES_H + +#include +#include + +#include + +#define SIZEOF_FP32 (4) +#define SIZEOF_FP16 (2) +#define VLEN (128) +#define VLEN_FP32 (VLEN / SIZEOF_FP32) +#define VLEN_FP16 (VLEN / SIZEOF_FP16) + +typedef union { + HVX_Vector v; + uint8_t b[VLEN]; + uint16_t h[VLEN_FP16]; + uint32_t w[VLEN_FP32]; + __fp16 fp16[VLEN_FP16]; + float fp32[VLEN_FP32]; +} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; + +typedef struct { + HVX_Vector v[2]; +} HVX_Vector_x2; + +typedef struct { + HVX_Vector v[4]; +} HVX_Vector_x4; + +typedef struct { + HVX_Vector v[8]; +} HVX_Vector_x8; + +#endif /* HVX_TYPES_H */ diff --git a/src/ggml-hexagon/htp/hvx-utils.c b/src/ggml-hexagon/htp/hvx-utils.c deleted file mode 100644 index 29d73b86..00000000 --- a/src/ggml-hexagon/htp/hvx-utils.c +++ /dev/null @@ -1,1020 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "hvx-utils.h" - -#define htp_binary_ops_preamble \ - int step_of_4 = num_elems >> 7; \ - int step_of_2 = (num_elems - step_of_4 * VLEN_FP32 * 4) >> 6; \ - int step_of_1 = (num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2) >> 5; \ - int remaining = num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; \ - \ - const uint8_t * restrict src0_curr = src0; \ - const uint8_t * restrict src1_curr = src1; \ - uint8_t * restrict dst_curr = dst; - -void hvx_mul_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || - (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - - bool handled_leftover = false; - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - int step_of_1 = num_elems_whole >> 5; 
// divby 32, because 32 float = 128 bytes per HVX vector - int leftover_size = left_over * sizeof(float); - - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - - HVX_Vector slinep; - HVX_Vector slinec; - HVX_Vector sline; - HVX_Vector sline2p; - HVX_Vector sline2c; - HVX_Vector sline2; - - slinep = *vec_in1++; - sline2p = *vec_in2++; - #pragma unroll(4) - for (int i = step_of_1 - 1; i > 0; i--) { - slinec = *vec_in1++; - sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; - } - if (step_of_1 > 1) { - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; - sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; - } - if (left_over > 0) { - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); - handled_leftover = true; - } - } - - - if (left_over > 0 && !handled_leftover) { - const float * src0f = (const float *) src0 + num_elems_whole; - const float * src1f = (const float *) src1 + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in1 = *(HVX_UVector *) src0f; - HVX_Vector in2 = *(HVX_UVector *) src1f; - - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - htp_binary_ops_preamble; - - for (int i = 0; i < step_of_4; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); - - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); - - HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); - - src0_curr += 4 * VLEN; - - HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(v3a, v3b); - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); - - *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); - - HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v4a, v4b); - - src1_curr += 4 * VLEN; - - *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); - - dst_curr += 4 * VLEN; - } - - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - 
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - src0_curr += 2 * VLEN; - - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); - - src1_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - - src1_curr += VLEN; - - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(va, vb); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); - - dst_curr += VLEN; - } - - if (remaining > 0) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); - } -} - -void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - const uint8_t * restrict src2, - uint8_t * restrict dst, - const int num_elems) { - const uint8_t * restrict src0_curr = src0; - const uint8_t * restrict src1_curr = src1; - const uint8_t * restrict src2_curr = src2; - uint8_t * restrict dst_curr = dst; - - int step_of_2 = num_elems >> 6; - int step_of_1 = (num_elems - step_of_2 * VLEN_FP32 * 2) >> 5; - int remaining = num_elems - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; - - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - HVX_Vector v1c = *(HVX_Vector *) src2_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1_ = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1_), v1c); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v2c = *(HVX_Vector *) (src2_curr + VLEN); - - src0_curr += 2 * VLEN; - - HVX_Vector v2_ = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2_), v2c); - - src1_curr += 2 * VLEN; - src2_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - src1_curr += VLEN; - - HVX_Vector vc = *(HVX_Vector *) src2_curr; - src2_curr += VLEN; - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(va, vb); - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), vc); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v2); - dst_curr += VLEN; - } - if (remaining > 0) { - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), *(HVX_Vector *) src2_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v2)); - } -} - -void hvx_add_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || - (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower 
execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); - - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - } - } - - if (left_over > 0) { - const float * src0f = (const float *) src0 + num_elems_whole; - const float * src1f = (const float *) src1 + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in1 = *(HVX_UVector *) src0f; - HVX_Vector in2 = *(HVX_UVector *) src1f; - - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_add_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - htp_binary_ops_preamble; - - for (int i = 0; i < step_of_4; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); - - HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); - - HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); - - src0_curr += 4 * VLEN; - - HVX_Vector v3 = Q6_Vqf32_vadd_VsfVsf(v3a, v3b); - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); - - *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); - - HVX_Vector v4 = Q6_Vqf32_vadd_VsfVsf(v4a, v4b); - - src1_curr += 4 * VLEN; - - *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); - - dst_curr += 4 * VLEN; - } - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - src0_curr += 2 * VLEN; - - HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b); - - src1_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - - src1_curr += VLEN; - - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(va, vb); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); - - dst_curr += VLEN; - } - if (remaining > 0) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) 
src1_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); - } -} - -void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - static const float kInf = INFINITY; - const HVX_Vector inf = hvx_vec_splat_fp32(kInf); - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *vec_in1++; - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - v = Q6_Vsf_equals_Vqf32(v); - v = Q6_V_vmux_QVV(pred_inf, inf, v); - *vec_out++ = v; - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - out = Q6_Vsf_equals_Vqf32(out); - out = Q6_V_vmux_QVV(pred_inf, inf, out); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - out = Q6_Vsf_equals_Vqf32(out); - out = Q6_V_vmux_QVV(pred_inf, inf, out); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); - } -} - -void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool handled_leftover = false; - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover_size = left_over * sizeof(float); - - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; - - HVX_Vector 
slinep; - HVX_Vector slinec; - HVX_Vector sline; - - slinep = *input_v_ptr++; - - #pragma unroll(4) - for (int i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - /* Prepare slinep for next iteration */ - slinep = slinec; - } - - if (step_of_1 > 0) { - slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - - slinep = slinec; - } - - if (leftover_size > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - - HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); - handled_leftover = true; - } - } - - if (left_over > 0 && !handled_leftover) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_sub_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || - (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - } - } - - if (left_over > 0) { - const float * src0f = (const float *) src0 + num_elems_whole; - const float * src1f = (const float *) src1 + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in1 = *(HVX_UVector *) src0f; - HVX_Vector in2 = *(HVX_UVector *) src1f; - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_sub_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - htp_binary_ops_preamble; - - for (int i = 0; i < step_of_4; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) 
src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); - - HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); - - HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); - - src0_curr += 4 * VLEN; - - HVX_Vector v3 = Q6_Vqf32_vsub_VsfVsf(v3a, v3b); - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); - - *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); - - HVX_Vector v4 = Q6_Vqf32_vsub_VsfVsf(v4a, v4b); - - src1_curr += 4 * VLEN; - - *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); - - dst_curr += 4 * VLEN; - } - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - src0_curr += 2 * VLEN; - - HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b); - - src1_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - - src1_curr += VLEN; - - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(va, vb); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); - - dst_curr += VLEN; - } - if (remaining > 0) { - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); - } -} - -void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec); 
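// The tail is still computed on a full 128-byte vector; hvx_vec_store_u() then writes back
// only left_over * SIZEOF_FP32 bytes, so lanes past the row end are computed but never stored.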
- hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); - } - - assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - - HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); - HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); - sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); - vec_in1++; - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - - HVX_Vector vec_left = *(HVX_UVector *) srcf; - - HVX_Vector vec_left_sq = Q6_Vqf32_vmpy_VsfVsf(vec_left, vec_left); - HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left_sq, zero_vec, left_over * SIZEOF_FP32); - - sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, vec_tmp); - } - - HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec_acc); - return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v)); -} - -float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); - HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - - if (0 == unaligned_loop) { - HVX_Vector * vec_in = (HVX_Vector *) src; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); - sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), in); - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - - HVX_Vector vec_left = *(HVX_UVector *) srcf; - HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left, zero_vec, left_over * SIZEOF_FP32); - // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, vec_tmp); - sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), vec_tmp); - } - - HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec); - return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v)); -} - -float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly 
slower execution\n"); - } - - HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); - HVX_Vector vec_first = hvx_vec_splat_fp32(((const float *) src)[0]); - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in = (HVX_Vector *) src; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, in); - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - HVX_Vector temp = Q6_V_valign_VVR(in, vec_first, left_over * SIZEOF_FP32); - vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, temp); - } - - HVX_Vector v = hvx_vec_reduce_max_fp32(vec_max); - return hvx_vec_get_fp32(v); -} - -void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - int unalign_address = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unalign_address = 1; - } - - const float * src_f = (const float *) src; - - HVX_Vector vec_min = hvx_vec_splat_fp32(val); - - if(unalign_address == 0){ - HVX_Vector * restrict vec_in = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); - *vec_out++ = (min_clamp); - } - }else{ - HVX_UVector * restrict vec_in = (HVX_Vector *) src; - HVX_UVector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); - *vec_out++ = (min_clamp); - } - } - - if (left_over > 0 ) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_UVector in = *(HVX_UVector *) srcf; - - HVX_UVector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, (min_clamp)); - } -} - -void hvx_clamp_scalar_f32(const uint8_t * restrict src, - const float limit_left, - const float limit_right, - uint8_t * restrict dst, - const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unalign_address = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unalign_address = 1; - } - - HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); - HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); - - if(unalign_address == 0){ - HVX_Vector * restrict vec_in = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in_vec = *vec_in++; - HVX_Vector temp_v = in_vec; - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, 
temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec); - - *vec_out++ = in_vec; - } - - }else{ - - HVX_UVector * restrict vec_in = (HVX_UVector *) src; - HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in_vec = *vec_in++; - HVX_Vector temp_v = in_vec; - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec); - - *vec_out++ = in_vec; - } - - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in_vec = *(HVX_UVector *) srcf; - - HVX_Vector temp_v = in_vec; - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec); - } -} - - diff --git a/src/ggml-hexagon/htp/hvx-utils.h b/src/ggml-hexagon/htp/hvx-utils.h index 22876e6d..7b79a5ea 100644 --- a/src/ggml-hexagon/htp/hvx-utils.h +++ b/src/ggml-hexagon/htp/hvx-utils.h @@ -1,1353 +1,17 @@ #ifndef HVX_UTILS_H #define HVX_UTILS_H -#include "ops-utils.h" - -#include -#include - -#define SIZEOF_FP32 (4) -#define SIZEOF_FP16 (2) -#define VLEN (128) -#define VLEN_FP32 (VLEN / SIZEOF_FP32) -#define VLEN_FP16 (VLEN / SIZEOF_FP16) - -typedef union { - HVX_Vector v; - uint8_t b[VLEN]; - uint16_t h[VLEN_FP16]; - uint32_t w[VLEN_FP32]; - __fp16 fp16[VLEN_FP16]; - float fp32[VLEN_FP32]; -} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; - -/* Q6_Vsf_equals_Vw is only available on v73+.*/ -#if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) -{ - HVX_Vector const vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); - return ret; -} - -static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) -{ - return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); -} -#endif - -static inline HVX_Vector hvx_vec_splat_fp32(float v) { - union { - float f; - uint32_t i; - } fp32 = { .f = v }; - - return Q6_V_vsplat_R(fp32.i); -} - -static inline HVX_Vector hvx_vec_splat_fp16(float v) { - union { - __fp16 f; - uint16_t i; - } fp16 = { .f = v }; - - return Q6_Vh_vsplat_R(fp16.i); -} - -static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) { - // Rotate as needed. 
- v = Q6_V_vlalign_VVR(v, v, (size_t) addr); - - uint32_t left_off = (size_t) addr & 127; - uint32_t right_off = left_off + n; - - HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); - HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); - - if (right_off > 128) { - Q6_vmem_QRIV(qr, (HVX_Vector *) addr + 1, v); - // all 1's - qr = Q6_Q_vcmp_eq_VbVb(v, v); - } - - ql_not = Q6_Q_or_QQn(ql_not, qr); - Q6_vmem_QnRIV(ql_not, (HVX_Vector *) addr, v); -} - -static inline void hvx_vec_store_a(void * ptr, size_t n, HVX_Vector v) { - assert((unsigned long) ptr % 128 == 0); - - HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) ptr); - HVX_VectorPred qr = Q6_Q_vsetq2_R(n); - ql_not = Q6_Q_or_QQn(ql_not, qr); - Q6_vmem_QnRIV(ql_not, (HVX_Vector *) ptr, v); -} - -static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) { - // vdelta control to replicate first 4 bytes across all elements - static const uint8_t __attribute__((aligned(128))) repl[128] = { - 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - }; - - HVX_Vector ctrl = *(HVX_Vector *) repl; - return Q6_V_vdelta_VV(v, ctrl); -} - -// copy n fp16 elements : source and destination are aligned to HVX Vector (128) -static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) dst % 128 == 0); - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); - } -} - -// copy n fp16 elements : source is aligned, destination is potentially unaligned -static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); - } -} - -// copy n fp16 elements : source is aligned, destination is potentially unaligned -static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_UVector * restrict vsrc = (HVX_UVector *) src; - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - 
HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); - } -} - -// copy n fp32 elements : source and destination are aligned to HVX Vector (128) -static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) dst % 128 == 0); - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy n fp32 elements : source is aligned, destination is unaligned -static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy n fp32 elements : source is unaligned, destination is aligned -static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_UVector * restrict vsrc = (HVX_UVector *) src; - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy n fp32 elements : source is unaligned, destination unaligned -static inline void hvx_copy_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; - HVX_UVector * restrict vsrc = (HVX_UVector *) src; - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned -static inline void hvx_copy_fp16_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16 - HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32 - - const HVX_Vector zero = Q6_V_vsplat_R(0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - vdst[i] = Q6_Vh_vdeal_Vh(s_hf); - } - - if (nloe) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], 
zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf)); - } -} - -// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned -static inline void hvx_copy_fp16_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16 - HVX_Vector * restrict vsrc = (HVX_Vector *) src; // fp32 - - const HVX_Vector zero = Q6_V_vsplat_R(0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - vdst[i] = Q6_Vh_vdeal_Vh(s_hf); - } - - if (nloe) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf)); - } -} - -// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned -static inline void hvx_copy_fp16_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; // fp16 - HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32 - - const HVX_Vector zero = Q6_V_vsplat_R(0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - vdst[i] = Q6_Vh_vdeal_Vh(s_hf); - } - - if (nloe) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf)); - } -} - -// bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned -static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - - HVX_Vector velem = hvx_vec_splat_fp32(elem); - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - vdst[i] = velem; - } - - if (nloe) { - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), velem); - } -} - - -/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. 
*/ -static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { - HVX_VectorAlias u = { .v = v }; - - const uint32_t n0 = n / 16; - const uint32_t n1 = n % 16; - int i = 0; - for (; i < n0; i++) { - htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); - } - if (n1) { - htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); - } -} - -static void hvx_vec_dump_fp16(char * pref, HVX_Vector v) { - hvx_vec_dump_fp16_n(pref, v, 64); -} - -static void hvx_vec_dump_fp32_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - float d[32]; - } u = { .v = v }; - - const uint32_t n0 = n / 16; - const uint32_t n1 = n % 16; - int i = 0; - for (; i < n0; i++) { - htp_dump_fp32_line(pref, u.d + (16 * i), 16); - } - if (n1) { - htp_dump_fp32_line(pref, u.d + (16 * i), n1); - } -} - -static void hvx_vec_dump_fp32_hmt(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - float d[32]; - } u = { .v = v }; - - FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1], - u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); -} - -static void hvx_vec_dump_fp32(char * pref, HVX_Vector v) { - hvx_vec_dump_fp32_n(pref, v, 32); -} - -static void hvx_vec_dump_int32(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int32_t d[32]; - } u = { .v = v }; - - for (int i = 0; i < 32 / 16; i++) { - htp_dump_int32_line(pref, u.d + (16 * i), 16); - } -} - -static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int32_t d[32]; - } u = { .v = v }; - - FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12], - u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); -} - -static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int8_t d[128]; - } u = { .v = v }; - - FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60], - u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]); -} - -static void hvx_vec_dump_int8(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int8_t d[128]; - } u = { .v = v }; - - for (int i = 0; i < 128 / 16; i++) { - htp_dump_int8_line(pref, u.d + (16 * i), 16); - } -} - -static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - uint8_t d[128]; - } u = { .v = v }; - - for (int i = 0; i < 128 / 16; i++) { - htp_dump_uint8_line(pref, u.d + (16 * i), 16); - } -} - -static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) { - typedef union { - HVX_Vector v; - int8_t d[128]; - } U; - - U u0 = { .v = v0 }; - U u1 = { .v = v1 }; - - for (int i = 0; i < n; i++) { - if (u0.d[i] != u1.d[i]) { - return false; - } - } - - return true; -} - -static inline float hvx_vec_get_fp32(HVX_Vector v) { - float __attribute__((aligned(128))) x; - hvx_vec_store_a(&x, 4, v); - return x; -} - -static inline HVX_Vector hvx_vec_int32_reduce_sum_n(HVX_Vector in, unsigned int n) { - unsigned int total = n * 4; // total vec nbytes - unsigned int width = 4; // int32 - - HVX_Vector sum = in, sum_t; - while (width < total) { - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum - width = width << 1; - } - return sum; -} - -static inline HVX_Vector hvx_vec_int32_reduce_sum(HVX_Vector in) { - return hvx_vec_int32_reduce_sum_n(in, 32); -} - -static inline HVX_Vector hvx_vec_qf32_reduce_sum_n(HVX_Vector in, unsigned int n) { - unsigned int total = n * 4; // total vec nbytes - unsigned int width = 4; // fp32 nbytes - - HVX_Vector sum = in, sum_t; - while (width < total) { - sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right - sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum - width = width << 1; - } - return sum; -} - -static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) { - return hvx_vec_qf32_reduce_sum_n(in, 32); -} - -static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n) { - unsigned int total = n * 4; // total vec nbytes - unsigned int width = 4; // fp32 nbytes - - HVX_Vector sum = in, sum_t; - while (width < total) { - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum - width = width << 1; - } - return sum; -} - -static inline HVX_Vector hvx_vec_fp32_reduce_sum(HVX_Vector in) { - return hvx_vec_fp32_reduce_sum_n(in, 32); -} - -static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) { - unsigned total = 128; // total vec nbytes - unsigned width = 2; // fp16 nbytes - - HVX_Vector _max = in, _max_t; - while (width < total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_reduce_max2_fp16(HVX_Vector in, HVX_Vector _max) { - unsigned total = 128; // total vec nbytes - unsigned width = 2; // fp32 nbytes - - HVX_Vector _max_t; - - _max = Q6_Vhf_vmax_VhfVhf(in, _max); - while (width < total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_reduce_max_fp32(HVX_Vector in) { - unsigned total = 128; // total vec nbytes - unsigned width = 4; // fp32 nbytes - - HVX_Vector _max = in, _max_t; - while (width < 
total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_reduce_max2_fp32(HVX_Vector in, HVX_Vector _max) { - unsigned total = 128; // total vec nbytes - unsigned width = 4; // fp32 nbytes - - HVX_Vector _max_t; - - _max = Q6_Vsf_vmax_VsfVsf(in, _max); - while (width < total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { - // abs by clearing the fp16 sign bit - HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff); - return Q6_V_vand_VV(v, mask); -} - -static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { - // neg by setting the fp16 sign bit - HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); - return Q6_V_vxor_VV(v, mask); -} - -static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { - // abs by clearing the fp32 sign bit - HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff); - return Q6_V_vand_VV(v, mask); -} - -static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { -#if __HVX_ARCH__ > 75 - return Q6_Vsf_vfneg_Vsf(v); -#else - // neg by setting the fp32 sign bit - HVX_Vector mask = Q6_V_vsplat_R(0x80000000); - return Q6_V_vxor_VV(v, mask); -#endif // __HVX_ARCH__ > 75 -} - -// ==================================================== -// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5 -// Order:3; continuity: True; Ends forced: True -// Mode: unsigned; Result fractional bits: 14 -// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05 -// 32769 -32706 31252 -10589 -// 32590 -30635 22793 -4493 -// 32066 -27505 16481 -2348 -// 31205 -24054 11849 -1306 - -static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) { - // input is 0..0xffff representing 0.0 .. 1.0 - HVX_Vector p; - p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull); - p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); - p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); - p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); - return p; // signed result, 14 fractional bits -} - -// Find reciprocal of fp16. -// (1) first, convert to fp32, multiplying by 1.0; this is done to -// handle denormals. Ignoring sign and zero, result should be at -// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000) -// (exponent in range [103,143]) -// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly -// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32 -// (4) convert that to fp16 -// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace -// the result with the max value. -static inline HVX_Vector hvx_vec_inverse_fp16(HVX_Vector vals) { - HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF); - HVX_Vector avals = Q6_V_vand_VV(vals, em_mask); - HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals); - // is too small to 1/x ? 
for 'standard' fp16, this would be 0x101 - HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals); - - HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0 - HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32)); - HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32)); - - // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector - HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9)); - // likewise extract the upper 16 from each, containing the exponents in range 103..142 - HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0); - //Get exponent in IEEE 32-bit representation - exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7); - - // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane - // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0) - // Use poly to transform to 1/x, with 14 fractional bits - // - HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16); - - HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros - - // Get mantissa for 16-bit represenation - HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF)); - - //Compute Reciprocal Exponent - HVX_Vector exp_recip = - Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1))); - //Convert it for 16-bit representation - exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15)); - exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10); - - //Merge exponent and mantissa for reciprocal - HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip); - // map 'small' inputs to standard largest value 0x7bff - recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip); - // add sign back - recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000); - return recip; -} - -#define IEEE_VSF_EXPLEN (8) -#define IEEE_VSF_EXPBIAS (127) -#define IEEE_VSF_EXPMASK (0xFF) -#define IEEE_VSF_MANTLEN (23) -#define IEEE_VSF_MANTMASK (0x7FFFFF) -#define IEEE_VSF_MIMPMASK (0x800000) - -static inline HVX_Vector hvx_vec_truncate_fp32(HVX_Vector in_vec) { - HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); - HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); - HVX_Vector const_zero_v = Q6_V_vzero(); - - HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); - - HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; - expval_v &= IEEE_VSF_EXPMASK; - expval_v -= IEEE_VSF_EXPBIAS; - - // negative exp == fractional value - HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); - - HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift - - HVX_Vector mant_v = in_vec & mask_mant_v; // obtain mantissa - HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0 - - vout = Q6_Vw_vasr_VwVw(vout, rshift_v); // shift to obtain truncated integer - vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0 - - HVX_Vector neg_vout = -vout; - - vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // handle negatives - - return (vout); -} - -static inline HVX_Vector hvx_vec_floor_fp32(HVX_Vector in_vec) { - HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); - HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); - HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN); - HVX_Vector const_zero_v = Q6_V_vzero(); - HVX_Vector const_negone_v = 
Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf - - HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); - - HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; - expval_v &= IEEE_VSF_EXPMASK; - expval_v -= IEEE_VSF_EXPBIAS; - - HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); - HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v); - HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v); - HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec); - - // if expval < 0 (q_negexp) // <0, floor is 0 - // if vin > 0 - // floor = 0 - // if vin < 0 - // floor = -1 - // if expval < mant_len (q_expltmn) // >0, but fraction may exist - // get sign (q_negative) - // mask >> expval // fraction bits to mask off - // vout = ~(mask) // apply mask to remove fraction - // if (qneg) // negative floor is one less (more, sign bit for neg) - // vout += ((impl_mask) >> expval) - // if (mask && vin) - // vout = vin - // else // already an integer - // ; // no change - - // compute floor - mask_mant_v >>= expval_v; - HVX_Vector neg_addin_v = mask_impl_v >> expval_v; - HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v); - HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec); - - HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set - HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v); - - HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v); // frac bits to clear - HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits - - vout = in_vec; - vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout); // expval0 -> 0 - vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1 - - return vout; -} - -static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { - // This looks complicated. - // Ideally should just be Q6_Vh_equals_Vhf(vin) - // but that instruction does not do proper rounding. - - // convert to qf32, multiplying by 1.0 in the process. - HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00)); - - // 'in-range' values are +/32752. - // add 192K to it, convert to sf - HVX_Vector v192K = Q6_V_vsplat_R(0x48400000); - HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K)); - HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K)); - - // for in-range cases, result is {163858... 229360} so the exponent is always 144. - // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer. - // Start by <<10 to get the final 'sign' bit in bit 15... 
- vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10); - vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10); - - // now round down to 16 - return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0); -} - -static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { - HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3); - HVX_Vector two_sf = hvx_vec_splat_fp32(2.0); - - // First approximation - HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf); - - HVX_Vector r_qf; - - // Refine - r_qf = Q6_Vqf32_vmpy_VsfVsf( - i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf))))); - r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( - r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); - r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( - r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); - - return Q6_Vsf_equals_Vqf32(r_qf); -} - -#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 -#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 -#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 -#define FAST_SIGMOID_C3 (0x3f000000) // 0.5 - -static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) { - v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); - v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3)); - - HVX_Vector in_int = hvx_vec_truncate_fp32(Q6_Vsf_equals_Vqf32(v)); - HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int)); - HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x); - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2)); - v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); - - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1)); - v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx); - v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x); - - HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1)); - HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1); - v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24); - v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent); - v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24); - - HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); - HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); - - HVX_Vector res = hvx_vec_inverse_fp32(v5); - res = Q6_Vqf32_vmpy_VsfVsf(v3, res); - - return Q6_Vsf_equals_Vqf32(res); -} - -#define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!) -#define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!) -#define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!) -#define EXP_COEFF_2 (0x3D2AA9C1) // 0.416658 = 1/(4!) -#define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!) -#define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!) 
-#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805 -#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408 -#define EXP_ONE (0x3f800000) // 1.0 -#define EXP_RANGE_R (0x41a00000) // 20.0 -#define EXP_RANGE_L (0xc1a00000) // -20.0 - -static inline HVX_Vector hvx_vec_exp_fp32(HVX_Vector in_vec) { - HVX_Vector z_qf32_v; - HVX_Vector x_v; - HVX_Vector x_qf32_v; - HVX_Vector y_v; - HVX_Vector k_v; - HVX_Vector f_v; - HVX_Vector epsilon_v; - HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E); - HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2); - HVX_Vector E_const; - HVX_Vector zero_v = Q6_V_vzero(); - - // exp(x) is approximated as follows: - // f = floor(x/ln(2)) = floor(x*log2(e)) - // epsilon = x - f*ln(2) - // exp(x) = exp(epsilon+f*ln(2)) - // = exp(epsilon)*exp(f*ln(2)) - // = exp(epsilon)*2^f - // - // Since epsilon is close to zero, it can be approximated with its Taylor series: - // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+... - // Preserving the first eight elements, we get: - // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7 - // = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2 - - HVX_Vector temp_v = in_vec; - - // Clamp inputs to (-20.0, 20.0) - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R)); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v); - - epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec); - epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v); - - // f_v is the floating point result and k_v is the integer result - f_v = hvx_vec_floor_fp32(epsilon_v); - k_v = hvx_vec_truncate_fp32(f_v); - - x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v); - - // x = x - f_v * logn2; - epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2); - x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v); - // normalize before every QFloat's vmpy - x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v); - - // z = x * x; - z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v); - z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v); - - x_v = Q6_Vsf_equals_Vqf32(x_qf32_v); - - // y = E4 + E5 * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_5); - y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v); - E_const = Q6_V_vsplat_R(EXP_COEFF_4); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E3 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_3); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E2 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_2); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E1 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_1); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E0 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_0); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = x + y * z; - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = y + 1.0; - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE)); - - // insert exponents - // y = 
ldexpf(y, k); - // y_v += k_v; // qf32 - // modify exponent - - y_v = Q6_Vsf_equals_Vqf32(y_v); - - // add k_v to the exponent of y_v - HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1); - - y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1); - y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent); - - // exponent cannot be negative; if overflow is detected, result is set to zero - HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent); - - y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN); - - y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v); - - return y_v; -} - -#define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation -#define RSQRT_ONE_HALF 0x3f000000 // 0.5 -#define RSQRT_THREE_HALVES 0x3fc00000 // 1.5 - -static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { - //Algorithm : - // x2 = input*0.5 - // y = * (long *) &input - // y = 0x5f3759df - (y>>2) - // y = y*(threehalfs - x2*y*y) - - HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST); - HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF); - HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES); - - HVX_Vector x2, y, ypower2, temp; - - x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf); - x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero()); - - y = Q6_Vw_vasr_VwR(in_vec, 1); - y = Q6_Vw_vsub_VwVw(rsqrtconst, y); - - // 1st iteration - ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y); - ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); - temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); - temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp)); - - // 2nd iteration - y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); - ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); - ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); - temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); - - // 3rd iteration - y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); - ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); - ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); - temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); - - return Q6_Vsf_equals_Vqf32(temp); -} - -static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v, - HVX_Vector one, - HVX_Vector max_exp, - HVX_Vector min_exp) { - const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); - const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); - - HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); - out = Q6_V_vmux_QVV(pred_max, out, one); - return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero()); -} - -static inline HVX_Vector hvx_vec_tanh_fp32(HVX_Vector x) { - // tanh(x) = 2 * sigmoid(2x) - 1 - HVX_Vector two = hvx_vec_splat_fp32(2.0f); - HVX_Vector one = hvx_vec_splat_fp32(1.0f); - HVX_Vector x2 = Q6_Vqf32_vmpy_VsfVsf(x, two); - - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - - HVX_Vector sig2x = hvx_vec_fast_sigmoid_fp32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp); - - HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two); - res = Q6_Vqf32_vsub_Vqf32Vsf(res, one); - return Q6_Vsf_equals_Vqf32(res); -} - -static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict 
dst, const int num_elems) { - int step_of_1 = num_elems >> 5; - int remaining = num_elems - step_of_1 * VLEN_FP32; - - const HVX_Vector * restrict v_src = (HVX_Vector *) src; - HVX_Vector * restrict v_dst = (HVX_Vector *) dst; - - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - - const HVX_Vector one = hvx_vec_splat_fp32(1.f); - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - - #pragma unroll(4) - for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); - } - - if (remaining > 0) { - const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32; - float * dstf = (float *) dst + step_of_1*VLEN_FP32; - - HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp); - hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out); - } -} - -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); - - int32_t leftover_size = leftover * sizeof(float); - - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - - const HVX_Vector one = hvx_vec_splat_fp32(1.f); - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - - const float *input = (float *)src; - float *output = (float *)dst; - - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; - - HVX_Vector slinep; - HVX_Vector slinec; - HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for (int i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - /* Prepare slinep for next iteration */ - slinep = slinec; - } - - if (step_of_1 > 0) { - slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - ; - - slinep = slinec; - } - if (leftover > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? 
slinep : *input_v_ptr++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - - HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); - } -} - -static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - - HVX_Vector * vsrc = (HVX_Vector *) src; - HVX_Vector * vdst = (HVX_Vector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - - HVX_UVector * vsrc = (HVX_UVector *) src; - HVX_UVector * vdst = (HVX_UVector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { - if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) { - hvx_scale_f32_aa(dst, src, n, scale); - } else { - hvx_scale_f32_uu(dst, src, n, scale); - } -} - -static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - HVX_Vector vo = hvx_vec_splat_fp32(offset); - - HVX_Vector * vsrc = (HVX_Vector *) src; - HVX_Vector * vdst = (HVX_Vector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - HVX_Vector vo = hvx_vec_splat_fp32(offset); - - HVX_UVector * vsrc = (HVX_UVector *) src; - HVX_UVector * vdst = (HVX_UVector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { - if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) { - 
hvx_scale_offset_f32_aa(dst, src, n, scale, offset); - } else { - hvx_scale_offset_f32_uu(dst, src, n, scale, offset); - } -} - -float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); -void hvx_mul_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - const uint8_t * restrict src2, - uint8_t * restrict dst, - const int num_elems); -void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_add_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_add_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_sub_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_sub_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems); -void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems); -void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate); -float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems); -float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems); -void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_clamp_scalar_f32(const uint8_t * restrict src, - const float limit_left, - const float limit_right, - uint8_t * restrict dst, - const int num_elems); +#include "hex-utils.h" + +#include "hvx-types.h" +#include "hvx-copy.h" +#include "hvx-scale.h" +#include "hvx-exp.h" +#include "hvx-inverse.h" +#include "hvx-reduce.h" +#include "hvx-sigmoid.h" +#include "hvx-sqrt.h" +#include "hvx-arith.h" +#include "hvx-base.h" #endif /* HVX_UTILS_H */ diff --git a/src/ggml-hexagon/htp/main.c b/src/ggml-hexagon/htp/main.c index 24b3e90e..e28a67a9 100644 --- a/src/ggml-hexagon/htp/main.c +++ b/src/ggml-hexagon/htp/main.c @@ -1,17 +1,13 @@ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" #pragma clang diagnostic ignored "-Wunused-function" -#define FARF_ERROR 1 -#define FARF_HIGH 1 -#define FARF_MEDIUM 0 -#define FARF_LOW 0 +#include +#include #include #include #include #include -#include #include -#include #include #include #include @@ -19,13 +15,14 @@ #include #include +#include "hex-dma.h" +#include "hex-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "ops-utils.h" #include "worker-pool.h" AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { @@ -362,14 +359,14 @@ struct profile_data { static inline void profile_start(struct profile_data * d) { d->usecs = 
HAP_perf_get_qtimer_count(); - d->cycles = htp_get_cycles(); - d->pkts = htp_get_pktcnt(); + d->cycles = hex_get_cycles(); + d->pkts = hex_get_pktcnt(); } static inline void profile_stop(struct profile_data * d) { d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); - d->cycles = htp_get_cycles() - d->cycles; - d->pkts = htp_get_pktcnt() - d->pkts; + d->cycles = hex_get_cycles() - d->cycles; + d->pkts = hex_get_pktcnt() - d->pkts; } static int send_htp_rsp(struct htp_context * c, @@ -443,6 +440,43 @@ static void proc_matmul_req(struct htp_context * ctx, send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); } +static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { + struct dspqueue_buffer rsp_bufs[1]; + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[0].fd = bufs[1].fd; + rsp_bufs[0].ptr = bufs[1].ptr; + rsp_bufs[0].offset = bufs[1].offset; + rsp_bufs[0].size = bufs[1].size; + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.dst.data = (uint32_t) bufs[1].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_cpy(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); +} + static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { struct dspqueue_buffer rsp_bufs[1]; @@ -993,6 +1027,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { proc_get_rows_req(ctx, &req, bufs); break; + case HTP_OP_CPY: + if (n_bufs != 2) { + FARF(ERROR, "Bad cpy-req buffer list"); + continue; + } + proc_cpy_req(ctx, &req, bufs); + break; + default: FARF(ERROR, "Unknown Op %u", req.op); break; diff --git a/src/ggml-hexagon/htp/matmul-ops.c b/src/ggml-hexagon/htp/matmul-ops.c index 9bb39db9..1603ff2b 100644 --- a/src/ggml-hexagon/htp/matmul-ops.c +++ b/src/ggml-hexagon/htp/matmul-ops.c @@ -3,28 +3,20 @@ #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define MM_SPAD_SRC0_NROWS 16 #define MM_SPAD_SRC1_NROWS 16 @@ -36,20 +28,8 @@ struct htp_matmul_type { void (*vec_dot_rx2)(const int n, float * restrict s, const void * restrict vx, uint32_t vx_row_size, const void * restrict vy); }; -typedef struct { - HVX_Vector v[2]; -} HVX_Vector_x2; - -typedef struct { - HVX_Vector v[4]; -} HVX_Vector_x4; - -typedef struct { - HVX_Vector v[8]; -} HVX_Vector_x8; - // vdelta control to replicate first 4x fp32 values across lanes -static const uint8_t __attribute__((aligned(128))) repl_4x_fp32[128] = { +static const uint8_t 
__attribute__((aligned(128))) repl_4x_f32[128] = { 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, @@ -60,7 +40,7 @@ static const uint8_t __attribute__((aligned(128))) repl_4x_fp32[128] = { }; // vdelta control to replicate and interleave first 8x fp32 values across lanes -static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_fp32[128] = { +static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_f32[128] = { 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, @@ -71,7 +51,7 @@ static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_fp32[128] }; // vdelta control to replicate first fp32 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_1x_fp32[128] = { +static const uint8_t __attribute__((aligned(128))) repl_1x_f32[128] = { 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, @@ -82,7 +62,7 @@ static const uint8_t __attribute__((aligned(128))) repl_1x_fp32[128] = { }; // vdelta control to replicate first fp16 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_1x_fp16[128] = { +static const uint8_t __attribute__((aligned(128))) repl_1x_f16[128] = { 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, @@ -93,7 +73,7 @@ static const uint8_t __attribute__((aligned(128))) repl_1x_fp16[128] = { }; // vdelta control to replicate first fp16 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_2x_fp16[128] = { +static const uint8_t __attribute__((aligned(128))) repl_2x_f16[128] = { 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, @@ -129,7 +109,7 @@ static inline size_t q8x4x2_row_size(uint32_t ne) { // ensures perfect alignment of quants and full row const uint32_t qk = QK_Q8_0x4x2; const uint32_t nb = (ne + qk - 1) / qk; - return htp_round_up(ne + nb * 8 * sizeof(__fp16), 128); + return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128); } static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) { @@ -389,7 +369,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * } // Reduce and convert into fp32 - r0_sum = 
hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -485,8 +465,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -562,7 +542,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * } // Reduce and convert into fp32 - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -658,8 +638,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -768,7 +748,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, } // Reduce and convert into fp32 - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -900,8 +880,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -933,7 +913,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -977,8 +957,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n, rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf))); } - rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum0)); - rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum1)); + rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0)); + rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -1010,7 +990,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -1062,7 +1042,7 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), 
Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -1359,7 +1339,7 @@ static void matvec_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); } - hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); + hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); t2 = HAP_perf_get_qtimer_count(); @@ -1411,7 +1391,7 @@ static void matmul_id(struct htp_matmul_type * mt, struct htp_ops_context * octx const size_t src0_row_size = nb01; const size_t src1_row_size = q8x4x2_row_size(ne10); - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); // Per-thread VTCM scratchpads for all tensors // Note that the entire src1 tensor is already in VTCM @@ -1524,7 +1504,7 @@ static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx const size_t src0_row_size = nb01; const size_t src1_row_size = q8x4x2_row_size(ne10); - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); const uint32_t n_aids = src2->ne[0]; // num activated experts const uint32_t n_ids = ne02; // num experts @@ -1590,7 +1570,7 @@ static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx // *** dynamic quant -static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { +static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -1598,10 +1578,10 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri HVX_Vector zero = Q6_V_vsplat_R(0); // Use reduce max fp32 to find max(abs(e)) first - HVX_Vector vmax0_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[0])); - HVX_Vector vmax1_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[1])); - HVX_Vector vmax2_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[2])); - HVX_Vector vmax3_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[3])); + HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); + HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); + HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); + HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); // Load and convert into QF32 HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements @@ -1623,7 +1603,7 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_fp16; + HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_f16; vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl); vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl); @@ -1641,8 +1621,8 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri hvx_vec_store_u(y_d + 6, 2, rotated_vd_hf); // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_fp16(vd01_hf); - HVX_Vector vd23_inv_hf = 
hvx_vec_inverse_fp16(vd23_hf); + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); @@ -1654,7 +1634,7 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri *(HVX_Vector *) y_q = vx_i8; } -static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { +static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -1672,11 +1652,11 @@ static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restri HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Compute max and scale - HVX_Vector vmax01_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx01_hf)); - HVX_Vector vmax23_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx23_hf)); + HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); + HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_fp16; + HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16; vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl); vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl); @@ -1689,8 +1669,8 @@ static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restri hvx_vec_store_u(y_d + 4, 4, vd23_hf); // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_fp16(vd01_hf); - HVX_Vector vd23_inv_hf = hvx_vec_inverse_fp16(vd23_hf); + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); @@ -1702,7 +1682,7 @@ static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restri *(HVX_Vector *) y_q = vx_i8; } -static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { +static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -1720,11 +1700,11 @@ static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restri HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Compute max and scale - HVX_Vector vmax_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx01_hf)); - vmax_hf = hvx_vec_reduce_max2_fp16(hvx_vec_abs_fp16(vx23_hf), vmax_hf); + HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); + vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_fp16; + HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16; vmax_hf = Q6_V_vdelta_VV(vmax_hf, ctrl); HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 @@ -1733,7 +1713,7 @@ static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restri *(HVX_UVector *) y_d = vd_hf; // Divide input by the scale - HVX_Vector vd_inv_hf = hvx_vec_inverse_fp16(vd_hf); + HVX_Vector vd_inv_hf = 
hvx_vec_inverse_f16(vd_hf); vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd_inv_hf)); vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd_inv_hf)); @@ -1746,7 +1726,7 @@ static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restri } // Overrides input x -static void quantize_row_fp32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { +static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { assert(k % 32 == 0); const uint32_t qk = QK_Q8_0x4x2; const uint32_t nb = (k + qk - 1) / qk; @@ -1764,24 +1744,24 @@ static void quantize_row_fp32_q8x4x2(float * restrict x, uint8_t * restrict y, u for (uint32_t i = 0; i < nb; i++) { #if FP32_QUANTIZE_GROUP_SIZE == 32 - quantize_block_fp32_q8x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_fp32_q8x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + quantize_block_f32_q8x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); #elif FP32_QUANTIZE_GROUP_SIZE == 64 - quantize_block_fp32_q8x2(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_fp32_q8x2(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + quantize_block_f32_q8x2(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8x2(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); #elif FP32_QUANTIZE_GROUP_SIZE == 128 - quantize_block_fp32_q8x4(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_fp32_q8x4(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + quantize_block_f32_q8x4(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8x4(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); #else #error "FP32_QUANTIZE_GROUP_SIZE must be 32, 64, or 128" #endif } // now copy the scales into final location - hvx_copy_fp16_ua(y_d, t_d, nb * 8); + hvx_copy_f16_ua(y_d, t_d, nb * 8); } -static void quantize_fp32_q8x4x2(const struct htp_tensor * src, +static void quantize_f32_q8x4x2(const struct htp_tensor * src, uint8_t * restrict dst, struct htp_spad * spad, uint32_t nth, @@ -1807,26 +1787,26 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith); - const size_t src_row_size_padded = htp_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding for (uint32_t i = ir_first; i < ir_last; ++i) { - htp_l2fetch(src_data, 2, src_row_size, src_row_size); - hvx_copy_fp32_aa(tmp_data, src_data, ne0); + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); // FARF(HIGH, "quantize-q8x4-row: %u\n", i); - quantize_row_fp32_q8x4x2((float *) tmp_data, dst_data, ne0); + quantize_row_f32_q8x4x2((float *) tmp_data, dst_data, ne0); dst_data += dst_row_size; src_data += src_row_size; } uint64_t t2 
= HAP_perf_get_qtimer_count(); - FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, + FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void quantize_fp32_fp16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, +static void quantize_f32_f16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread, uint32_t dst_stride) { uint64_t t1 = HAP_perf_get_qtimer_count(); @@ -1848,8 +1828,8 @@ static void quantize_fp32_fp16(const struct htp_tensor * src, uint8_t * restrict uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); for (uint32_t i = ir_first; i < ir_last; ++i) { - htp_l2fetch(src_data, 2, src_row_size, src_stride); - hvx_copy_fp16_fp32_au(dst_data, src_data, ne0); + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f16_f32_au(dst_data, src_data, ne0); dst_data += dst_stride; src_data += src_stride; @@ -1857,12 +1837,12 @@ static void quantize_fp32_fp16(const struct htp_tensor * src, uint8_t * restrict uint64_t t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "quantize-fp32-fp16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, + FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // TODO just a plain copy that should be done via the DMA during the Op setup -static void quantize_fp16_fp16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, +static void quantize_f16_f16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread, uint32_t dst_stride) { uint64_t t1 = HAP_perf_get_qtimer_count(); @@ -1884,8 +1864,8 @@ static void quantize_fp16_fp16(const struct htp_tensor * src, uint8_t * restrict uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); for (uint32_t i = ir_first; i < ir_last; ++i) { - htp_l2fetch(src_data, 2, src_row_size, src_stride); - hvx_copy_fp16_au(dst_data, src_data, ne0); + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f16_au(dst_data, src_data, ne0); dst_data += dst_stride; src_data += src_stride; @@ -1893,23 +1873,23 @@ static void quantize_fp16_fp16(const struct htp_tensor * src, uint8_t * restrict uint64_t t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "quantize-fp16-fp16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, + FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { +static void htp_quantize_f32_q8x4x2(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; - quantize_fp32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread); + quantize_f32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread); } -static void htp_quantize_fp32_fp16(unsigned int n, unsigned int i, void * data) { +static void 
htp_quantize_f32_f16(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; - quantize_fp32_fp16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); + quantize_f32_f16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); } -static void htp_quantize_fp16_fp16(unsigned int n, unsigned int i, void * data) { +static void htp_quantize_f16_f16(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; - quantize_fp16_fp16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); + quantize_f16_f16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); } // ** matmul/matvec callbacks for worker_pool @@ -2108,7 +2088,7 @@ int op_matmul(struct htp_ops_context * octx) { const size_t dst_row_size = nb1; size_t src1_row_size = nb11; - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); size_t src1_row_size_padded; worker_callback_t quant_job_func; @@ -2118,8 +2098,8 @@ int op_matmul(struct htp_ops_context * octx) { switch (src0->type) { case HTP_TYPE_Q4_0: - op_type = "q4x4x2-fp32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + op_type = "q4x4x2-f32"; + quant_job_func = htp_quantize_f32_q8x4x2; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_q4x4x2_q8x4x2; } else { @@ -2131,12 +2111,12 @@ int op_matmul(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2147,8 +2127,8 @@ int op_matmul(struct htp_ops_context * octx) { break; case HTP_TYPE_Q8_0: - op_type = "q8x4x2-fp32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + op_type = "q8x4x2-f32"; + quant_job_func = htp_quantize_f32_q8x4x2; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_q8x4x2_q8x4x2; } else { @@ -2160,12 +2140,12 @@ int op_matmul(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + 
octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2177,7 +2157,7 @@ int op_matmul(struct htp_ops_context * octx) { case HTP_TYPE_MXFP4: op_type = "mxfp4x4x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_mxfp4x4x2_q8x4x2; } else { @@ -2189,12 +2169,12 @@ int op_matmul(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2207,10 +2187,10 @@ int op_matmul(struct htp_ops_context * octx) { case HTP_TYPE_F16: { // Try optimized f16-f16 path first (src1 in VTCM) - const size_t f16_src1_row_size = htp_round_up(ne10 * 2, 128); - const size_t f16_src1_spad_size = htp_round_up(f16_src1_row_size * src1_nrows, 256); - const size_t f16_src0_spad_size = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads; - const size_t f16_dst_spad_size = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads; + const size_t f16_src1_row_size = hex_round_up(ne10 * 2, 128); + const size_t f16_src1_spad_size = hex_round_up(f16_src1_row_size * src1_nrows, 256); + const size_t f16_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads; + const size_t f16_dst_spad_size = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads; const size_t f16_total_size = f16_src1_spad_size + f16_src0_spad_size + f16_dst_spad_size; @@ -2222,7 +2202,7 @@ int op_matmul(struct htp_ops_context * octx) { if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) { // Optimized path op_type = "f16-f16"; - quant_job_func = (src1->type == HTP_TYPE_F32) ? htp_quantize_fp32_fp16 : htp_quantize_fp16_fp16; + quant_job_func = (src1->type == HTP_TYPE_F32) ? 
htp_quantize_f32_f16 : htp_quantize_f16_f16; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_f16_f16; } else { @@ -2231,9 +2211,9 @@ int op_matmul(struct htp_ops_context * octx) { src1_row_size = f16_src1_row_size; // row size post quantization - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); octx->src1_spad.size = octx->src1_spad.size_per_thread; octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; @@ -2251,9 +2231,9 @@ int op_matmul(struct htp_ops_context * octx) { src1_row_size = nb11; // original row size in DDR - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256); - octx->src1_spad.size_per_thread = htp_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256); + octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256); octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads; @@ -2332,7 +2312,7 @@ int op_matmul_id(struct htp_ops_context * octx) { const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); const uint32_t src0_nrows = ne01; // per expert const uint32_t src1_nrows = ne11 * ne12 * ne13; @@ -2350,7 +2330,7 @@ int op_matmul_id(struct htp_ops_context * octx) { switch (src0->type) { case HTP_TYPE_Q4_0: op_type = "q4x2x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; src1_row_size = q8x4x2_row_size(ne10); // row size post quantization if (src1_nrows > 1) { matmul_id_job_func = htp_matmul_id_q4x4x2_q8x4x2; @@ -2360,13 +2340,13 @@ int op_matmul_id(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); - octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * 
sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2379,7 +2359,7 @@ int op_matmul_id(struct htp_ops_context * octx) { case HTP_TYPE_Q8_0: op_type = "q8x2x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; src1_row_size = q8x4x2_row_size(ne10); // row size post quantization if (src1_nrows > 1) { matmul_id_job_func = htp_matmul_id_q8x4x2_q8x4x2; @@ -2389,13 +2369,13 @@ int op_matmul_id(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); - octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2408,7 +2388,7 @@ int op_matmul_id(struct htp_ops_context * octx) { case HTP_TYPE_MXFP4: op_type = "mxfp4x2x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; src1_row_size = q8x4x2_row_size(ne10); // row size post quantization if (src1_nrows > 1) { matmul_id_job_func = htp_matmul_id_mxfp4x4x2_q8x4x2; @@ -2418,13 +2398,13 @@ int op_matmul_id(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); - octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } diff --git 
a/src/ggml-hexagon/htp/ops-utils.h b/src/ggml-hexagon/htp/ops-utils.h deleted file mode 100644 index af9c3305..00000000 --- a/src/ggml-hexagon/htp/ops-utils.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef OPS_UTILS_H -#define OPS_UTILS_H - -#include "htp-msg.h" - -#ifndef MAX -# define MAX(a, b) ((a) > (b) ? (a) : (b)) -#endif - -#ifndef MIN -# define MIN(a, b) ((a) < (b) ? (a) : (b)) -#endif - -static inline uint64_t htp_get_cycles() { - uint64_t cycles = 0; - asm volatile(" %0 = c15:14\n" : "=r"(cycles)); - return cycles; -} - -static inline uint64_t htp_get_pktcnt() { - uint64_t pktcnt; - asm volatile(" %0 = c19:18\n" : "=r"(pktcnt)); - return pktcnt; -} - -static inline int32_t htp_is_aligned(void * addr, uint32_t align) { - return ((size_t) addr & (align - 1)) == 0; -} - -static inline uint32_t htp_round_up(uint32_t n, uint32_t m) { - return m * ((n + m - 1) / m); -} - -// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. -// Precompute mp (m' in the paper) and L such that division -// can be computed using a multiply (high 32b of 64b result) -// and a shift: -// -// n/d = (mulhi(n, mp) + n) >> L; -struct fastdiv_values { - uint32_t mp; - uint32_t l; -}; - -static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { - struct fastdiv_values result = { 0, 0 }; - // compute L = ceil(log2(d)); - while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { - ++(result.l); - } - - result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); - return result; -} - -static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { - // Compute high 32 bits of n * mp - const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32); // mulhi(n, mp) - // add n, apply bit shift - return (hi + n) >> vals->l; -} - -static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) { - return n - fastdiv(n, vals) * d; -} - -static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) { - const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); - asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); -} - -static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n && p < p_end; i++) { - p += snprintf(p, p_end - p, "%d, ", x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n && p < p_end; i++) { - p += snprintf(p, p_end - p, "%d, ", x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n; i++) { - p += snprintf(p, p_end - p, "%d, ", (int) x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n; i++) { - p += 
snprintf(p, p_end - p, "%.6f, ", (float) x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n; i++) { - p += snprintf(p, p_end - p, "%.6f, ", x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_f32(char * pref, const float * x, uint32_t n) { - uint32_t n0 = n / 16; - uint32_t n1 = n % 16; - - uint32_t i = 0; - for (; i < n0; i++) { - htp_dump_fp32_line(pref, x + (16 * i), 16); - } - if (n1) { - htp_dump_fp32_line(pref, x + (16 * i), n1); - } -} - -static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { - uint32_t n0 = n / 16; - uint32_t n1 = n % 16; - - uint32_t i = 0; - for (; i < n0; i++) { - htp_dump_fp16_line(pref, x + (16 * i), 16); - } - if (n1) { - htp_dump_fp16_line(pref, x + (16 * i), n1); - } -} - -#endif /* OPS_UTILS_H */ diff --git a/src/ggml-hexagon/htp/rope-ops.c b/src/ggml-hexagon/htp/rope-ops.c index a4399704..943ca5c9 100644 --- a/src/ggml-hexagon/htp/rope-ops.c +++ b/src/ggml-hexagon/htp/rope-ops.c @@ -2,27 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" // Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h #define HTP_ROPE_TYPE_NORMAL 0 @@ -370,8 +363,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int int is_aligned = 1; int opt_path = 0; - if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || - (0 == htp_is_aligned((void *) dst->data, VLEN))) { + if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || + (0 == hex_is_aligned((void *) dst->data, VLEN))) { FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); is_aligned = 0; } @@ -427,9 +420,9 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/set-rows-ops.c b/src/ggml-hexagon/htp/set-rows-ops.c index bdd64fcc..904484da 100644 --- a/src/ggml-hexagon/htp/set-rows-ops.c +++ b/src/ggml-hexagon/htp/set-rows-ops.c @@ -2,24 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include + #include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define 
GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define set_rows_preamble \ const uint32_t ne00 = octx->src0.ne[0]; \ @@ -76,7 +72,7 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const uintptr_t dst_ptr = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3; // copy row - hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); + hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } } } @@ -112,7 +108,7 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const uint8_t* src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03; uint8_t* dst_ptr = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3; - hvx_copy_fp16_fp32_uu(dst_ptr, src0_ptr, ne00); + hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00); } } } diff --git a/src/ggml-hexagon/htp/softmax-ops.c b/src/ggml-hexagon/htp/softmax-ops.c index 80d249a2..1b6b2eba 100644 --- a/src/ggml-hexagon/htp/softmax-ops.c +++ b/src/ggml-hexagon/htp/softmax-ops.c @@ -2,27 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define htp_softmax_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ @@ -100,8 +93,8 @@ static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, uint8_t * restrict dst_curr = dst; const uint8_t * restrict mask_curr = mask; - HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); - HVX_Vector slope_vec = hvx_vec_splat_fp32(slope); + HVX_Vector scale_vec = hvx_vec_splat_f32(scale); + HVX_Vector slope_vec = hvx_vec_splat_f32(slope); int step_of_1 = num_elems >> 5; @@ -134,9 +127,9 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, HVX_Vector * restrict v_dst = (HVX_Vector *) dst; HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); - HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]); + HVX_Vector max_vec = hvx_vec_splat_f32(((const float *) src)[0]); HVX_Vector zero_v = Q6_V_vzero(); - HVX_Vector one_v = hvx_vec_splat_fp32(1.0); + HVX_Vector one_v = hvx_vec_splat_f32(1.0); int step_of_1 = num_elems >> 5; @@ -146,7 +139,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1); } - HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec); + HVX_Vector v = hvx_vec_reduce_max_f32(max_vec); max_vec = hvx_vec_repl4(v); #pragma unroll(4) @@ -154,18 +147,18 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec); - HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2)); + HVX_Vector v3 = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(v2)); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3); v_pad[i] = v3; } - v = hvx_vec_qf32_reduce_sum(sum_vec); + v = hvx_vec_reduce_sum_qf32(sum_vec); sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v)); HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v); - HVX_Vector v4 = hvx_vec_inverse_fp32(sum_vec); + HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec); HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, 
v4, one_v); #pragma unroll(4) @@ -181,11 +174,11 @@ static float hvx_softmax_f32(const uint8_t * restrict src, uint8_t * restrict spad, const int num_elems, const float max) { - hvx_sub_scalar_f32(src, max, spad, num_elems); + hvx_sub_scalar_f32(spad, src, max, num_elems); hvx_exp_f32(spad, dst, num_elems, false); - float sum = hvx_self_sum_f32(dst, num_elems); + float sum = hvx_reduce_sum_f32(dst, num_elems); return sum; } @@ -255,7 +248,7 @@ static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ct if (1 == opt_path) { hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); } else { - float max = hvx_self_max_f32((const uint8_t *) wp0, ne00); + float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); sum = sum > 0.0 ? (1.0 / sum) : 1; hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); @@ -290,7 +283,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int int is_aligned = 1; int opt_path = 0; - if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + if (!hex_is_aligned((void *) src0->data, VLEN) || !hex_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } @@ -345,9 +338,9 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/unary-ops.c b/src/ggml-hexagon/htp/unary-ops.c index 8ed1e5b6..be8be8c4 100644 --- a/src/ggml-hexagon/htp/unary-ops.c +++ b/src/ggml-hexagon/htp/unary-ops.c @@ -2,28 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define htp_unary_preamble \ const uint32_t ne00 = src->ne[0]; \ @@ -55,7 +47,7 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, HVX_Vector * restrict v_dst = (HVX_Vector *) dst; HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); - HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); int step_of_1 = num_elems >> 5; #pragma unroll(4) @@ -65,15 +57,15 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); } - HVX_Vector reduced_sum = hvx_vec_qf32_reduce_sum(sum_v); + HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v); sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum)); - HVX_Vector t_v = hvx_vec_splat_fp32((float) num_elems); - 
HVX_Vector denom_v = hvx_vec_inverse_fp32(t_v); + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v); HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v); - HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { @@ -101,7 +93,7 @@ static void scale_htp_f32(const float * restrict src, float * restrict dst_local = dst + (ir * row_elems); if (ir + 1 < num_rows) { - htp_l2fetch(src_local + row_elems, 1, row_size, row_size); + hex_l2fetch(src_local + row_elems, row_size, row_size, 1); } hvx_scale_offset_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias); @@ -124,7 +116,7 @@ static void rms_norm_htp_f32(const float * restrict src, float * restrict dst_local = dst + (ir * row_elems); if (ir + 1 < num_rows) { - htp_l2fetch(src_local + row_elems, 1, row_size, row_size); + hex_l2fetch(src_local + row_elems, row_size, row_size, 1); } if (1 == opt_path) { @@ -168,9 +160,8 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src, int is_aligned = 1; int opt_path = 0; - if ((0 == htp_is_aligned((void *) src->data, VLEN)) || (0 == htp_is_aligned((void *) dst->data, VLEN))) { + if ((0 == hex_is_aligned((void *) src->data, VLEN)) || (0 == hex_is_aligned((void *) dst->data, VLEN))) { is_aligned = 0; - FARF(HIGH, "unary-f32: unaligned addresses in unary op, possibly slower execution\n"); } if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { opt_path = 1; @@ -240,8 +231,8 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const size_t dst_row_size = dst->nb[1]; // VTCM scratchpads for all tensors - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/worker-pool.c b/src/ggml-hexagon/htp/worker-pool.c index cd38c212..894815f4 100644 --- a/src/ggml-hexagon/htp/worker-pool.c +++ b/src/ggml-hexagon/htp/worker-pool.c @@ -7,10 +7,6 @@ #include #include -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include "HAP_farf.h" #define WORKER_THREAD_STACK_SZ (2 * 16384)