From: Max Krasnyansky
Date: Fri, 30 Jan 2026 08:28:03 +0000 (+0200)
Subject: hexagon: support for OP_CPY, host buffers now optional (llama/18822)
X-Git-Tag: v0.9.6~69
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c2f1fb20ae37140b8d2d548d473af80fd1d4bfb5;p=pkg%2Fggml%2Fsources%2Fggml

hexagon: support for OP_CPY, host buffers now optional (llama/18822)
---

diff --git a/src/ggml-hexagon/ggml-hexagon.cpp b/src/ggml-hexagon/ggml-hexagon.cpp
index 365a24b4..cf1eb994 100644
--- a/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/src/ggml-hexagon/ggml-hexagon.cpp
@@ -42,12 +42,12 @@
 #include "htp_iface.h"

 static size_t opt_ndev = 1;
-static size_t opt_nhvx = 0; // use all
-static int opt_arch = 0; // autodetect
+static size_t opt_nhvx = 0; // use all
+static int opt_arch = 0; // autodetect
 static int opt_etm = 0;
 static int opt_verbose = 0;
 static int opt_profile = 0;
-static int opt_hostbuf = 1;
+static int opt_hostbuf = 1; // hostbuf ON by default
 static int opt_experimental = 0;

 // Enable all stages by default
@@ -1753,6 +1753,9 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
 }

 static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
+    if (!opt_hostbuf) {
+        return ggml_backend_buffer_is_hexagon(b);
+    }
     return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
 }

@@ -2302,6 +2305,16 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
     return n_bufs;
 }

+static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    req->op = HTP_OP_CPY;
+
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+
+    return n_bufs;
+}
+
 static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
     req->op = HTP_OP_GET_ROWS;

@@ -2557,6 +2570,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
                 ggml_hexagon_dispatch_op(sess, node, flags);
                 break;

+            case GGML_OP_CPY:
+                ggml_hexagon_dispatch_op(sess, node, flags);
+                break;
+
             default:
                 GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
         }

@@ -2858,6 +2875,27 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
     return true;
 }

+static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst = op;
+
+    // for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
+    if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
+    if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
+
+    const bool sametype = (src0->type == dst->type);
+    const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
+    const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
+
+    // can handle any shape and any same-type (pretty slow if reshaping is required)
+    if (sametype) return true;
+
+    // cannot handle re-shaping and type conversion at the same time
+    if (!sameshape) return false;
+
+    return true;
+}
+
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     auto sess = static_cast(dev->context);

@@ -2936,6 +2974,10 @@ static bool
ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_get_rows(sess, op); break; + case GGML_OP_CPY: + supp = ggml_hexagon_supported_cpy(sess, op); + break; + default: break; } @@ -3061,7 +3103,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t } static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { + if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) { ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type; return (void *) fct; } @@ -3078,34 +3120,31 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4, "please update hexagon_type to match ggml_type"); + const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL"); const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF"); - + const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); + const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC"); + const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); + const char * str_etm = getenv("GGML_HEXAGON_ETM"); + const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); + const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); + const char * str_arch = getenv("GGML_HEXAGON_ARCH"); + + opt_experimental = str_experimental ? atoi(str_experimental) : 0; opt_verbose = str_verbose ? atoi(str_verbose) : 0; - opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr; - opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr; - opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr; - - const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); - if (str_opmask != nullptr) { - opt_opmask = strtoul(str_opmask, NULL, 0); - } - opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr; + opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; + opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; + opt_opsync = str_opsync ? atoi(str_opsync) : 0; + opt_profile = str_profile ? atoi(str_profile) : 0; + opt_etm = str_etm ? atoi(str_etm) : 0; + opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; + opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; - const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); - if (str_ndev) { - opt_ndev = strtoul(str_ndev, NULL, 0); - if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { - opt_ndev = GGML_HEXAGON_MAX_SESSIONS; - } + if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { + opt_ndev = GGML_HEXAGON_MAX_SESSIONS; } - const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); - if (str_nhvx) { - opt_nhvx = strtoul(str_nhvx, NULL, 0); - } - - const char * str_arch = getenv("GGML_HEXAGON_ARCH"); if (str_arch) { if (str_arch[0] == 'v') { str_arch++; @@ -3113,8 +3152,6 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_arch = strtoul(str_arch, NULL, 0); } - opt_hostbuf = str_hostbuf ? 
atoi(str_hostbuf) : 1; - reg->context = new ggml_hexagon_registry(reg); HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), diff --git a/src/ggml-hexagon/htp/CMakeLists.txt b/src/ggml-hexagon/htp/CMakeLists.txt index 6a34a215..e8ef2030 100644 --- a/src/ggml-hexagon/htp/CMakeLists.txt +++ b/src/ggml-hexagon/htp/CMakeLists.txt @@ -17,11 +17,7 @@ add_library(${HTP_LIB} SHARED main.c htp_iface_skel.c worker-pool.c - htp-dma.c - hvx-sigmoid.c - hvx-inverse.c - hvx-exp.c - hvx-utils.c + hex-dma.c matmul-ops.c binary-ops.c unary-ops.c @@ -31,10 +27,12 @@ add_library(${HTP_LIB} SHARED flash-attn-ops.c set-rows-ops.c get-rows-ops.c + cpy-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE $,HTP_DEBUG=1,NDEBUG=1> + $,FARF_HIGH=1,> FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}) build_idl(htp_iface.idl ${HTP_LIB}) diff --git a/src/ggml-hexagon/htp/act-ops.c b/src/ggml-hexagon/htp/act-ops.c index 88bd2ddc..c3daf5ad 100644 --- a/src/ggml-hexagon/htp/act-ops.c +++ b/src/ggml-hexagon/htp/act-ops.c @@ -2,27 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define htp_act_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ @@ -76,7 +69,7 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, +static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, const struct htp_tensor * src1, struct htp_tensor * dst, const int32_t * op_params, @@ -124,9 +117,9 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, data_src1 += swapped ? 
0 : nc_in_bytes; } - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); @@ -175,9 +168,9 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); //swiglu(x) = x1 * sigmoid(x0) - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, nc); - hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, - (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, nc); + hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, + (const uint8_t *) src1_spad_ptr, nc); } dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size, @@ -203,7 +196,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, +static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, const struct htp_tensor * src1, struct htp_tensor * dst, const int32_t * op_params, @@ -249,9 +242,9 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, data_src1 += swapped ? 
0 : nc_in_bytes; } - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); @@ -304,18 +297,18 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // x (src0_spad_data) = std::min(src0_p[k], limit); - hvx_min_scalar_f32((const uint8_t *) src0_spad_ptr, limit, (uint8_t *) src0_spad_ptr, nc); + hvx_min_scalar_f32((uint8_t *) src0_spad_ptr, (const uint8_t *) src0_spad_ptr, limit, nc); // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit); - hvx_clamp_scalar_f32((const uint8_t *) src1_spad_ptr, -limit, limit, (uint8_t *) src1_spad_ptr, nc); + hvx_clamp_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, -limit, limit, nc); // y (src1_spad_data) = y1 + 1.f - hvx_add_scalar_f32((const uint8_t *) src1_spad_ptr, 1.0, (uint8_t *) src1_spad_ptr, nc); + hvx_add_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, 1.0, nc); // x1 (dst_spad_data) = alpha * (x) - hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, alpha, (uint8_t *) dst_spad_ptr, nc); + hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, alpha, nc); // x2 (dst_spad_data) = sigmoid(x1) = 1/(1+exp(-x1)) - hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, nc); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc); // out = x * sigmoid(alpha * x) * (y + 1.f) - hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, - (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc); + hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, + (const uint8_t *) src1_spad_ptr, nc); } dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size, @@ -342,7 +335,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, } -static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, +static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, struct htp_spad * src0_spad, @@ -358,8 +351,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); const uint32_t src0_nrows = ne01 * ne02 * ne03; @@ -415,9 +408,9 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // gelu = x * sigmoid(1.702 * x) // current implementation - hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 
1.702, (uint8_t *) dst_spad_ptr, ne0); - hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); - hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); + hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); + hvx_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -442,15 +435,15 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { +static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, +static void unary_silu_f32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, struct htp_spad * src0_spad, @@ -466,8 +459,8 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); const uint32_t src0_nrows = ne01 * ne02 * ne03; @@ -522,8 +515,8 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // silu = x * sigmoid(x) - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); - hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0); + hvx_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -548,25 +541,25 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) { +static void unary_silu_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) { +static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - 
glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, + glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) { +static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; - glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, + glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); } -static int execute_op_activations_fp32(struct htp_ops_context * octx) { +static int execute_op_activations_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; const struct htp_tensor * src0 = &octx->src0; @@ -583,21 +576,21 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { switch (octx->op) { case HTP_OP_UNARY_SILU: - act_op_func = unary_silu_fp32; + act_op_func = unary_silu_f32; op_type = "silu-f32"; break; case HTP_OP_GLU_SWIGLU: - act_op_func = glu_swiglu_fp32; + act_op_func = glu_swiglu_f32; op_type = "swiglu-f32"; break; case HTP_OP_GLU_SWIGLU_OAI: - act_op_func = glu_swiglu_oai_fp32; + act_op_func = glu_swiglu_oai_f32; op_type = "swiglu-oai-f32"; break; case HTP_OP_UNARY_GELU: - act_op_func = unary_gelu_fp32; + act_op_func = unary_gelu_f32; op_type = "gelu-f32"; break; default: @@ -617,9 +610,9 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { src1_row_size = src0_row_size; } - const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size @@ -670,7 +663,7 @@ int op_activations(struct htp_ops_context * octx) { switch (octx->src0.type) { case HTP_TYPE_F32: - err = execute_op_activations_fp32(octx); + err = execute_op_activations_f32(octx); break; default: diff --git a/src/ggml-hexagon/htp/binary-ops.c b/src/ggml-hexagon/htp/binary-ops.c index 8ed7f67d..de22afe4 100644 --- a/src/ggml-hexagon/htp/binary-ops.c +++ b/src/ggml-hexagon/htp/binary-ops.c @@ -2,36 +2,25 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" -typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0, - const uint8_t * src1, - uint8_t * data_dst, - const int num_elems); +typedef void (*hvx_elemwise_f32_func)(uint8_t * data_dst, const uint8_t * src0, const uint8_t * src1, const uint32_t num_elems); static hvx_elemwise_f32_func func_table_HVX[] = { hvx_mul_f32, hvx_add_f32, 
hvx_sub_f32 }; -static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt }; +static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_aa, hvx_add_f32_aa, hvx_sub_f32_aa }; #define htp_binary_preamble \ const struct htp_tensor * src0 = &octx->src0; \ @@ -98,9 +87,8 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx, int is_aligned = 1; int opt_path = 0; - if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || - (0 == htp_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || + (0 == hex_is_aligned((void *) dst->data, VLEN))) { is_aligned = 0; } if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { @@ -130,24 +118,24 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx, const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size; if (ir + 1 < src0_end_row) { - htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); + hex_l2fetch(src0_ptr + ne00, src0_row_size, src0_row_size, 1); if (src1_row_size == src0_row_size) { - htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size); + hex_l2fetch(src1_ptr, src1_row_size, src1_row_size, 1); } } const uint32_t nr0 = ne00 / ne10; if (nr0 > 1) { if ((1 == is_aligned) && (nr0 == ne00)) { - hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0); + hvx_splat_f32_a(spad_data_th, *(float *) src1_ptr, nr0); } else { for (uint32_t r = 0; r < nr0; r++) { memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11); } } - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, ne00); } else { - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, ne00); } src0_ptr += src0_row_size; @@ -185,11 +173,6 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx, uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || - (0 == htp_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n"); - } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; uint8_t * restrict data_dst = (uint8_t *) dst->data; @@ -210,9 +193,9 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx, const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 + i11 * nb11); if (ir + 1 < src0_end_row) { - htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); + hex_l2fetch(src0_ptr + ne00, src0_row_size, src0_row_size, 1); if (src1_row_size == src0_row_size) { - htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size); + hex_l2fetch(src1_ptr + ne10, src1_row_size, src1_row_size, 1); } } @@ -221,9 +204,9 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx, for (uint32_t r = 0; r < nr0; r++) { memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10); } - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) 
dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) spad_data, ne00); } else { - func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00); + func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, ne00); } } @@ -299,9 +282,9 @@ static int execute_op_binary_f32(struct htp_ops_context * octx) { const size_t dst_row_size = dst->nb[1]; // VTCM scratchpads for all tensors - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/cpy-ops.c b/src/ggml-hexagon/htp/cpy-ops.c new file mode 100644 index 00000000..559ca183 --- /dev/null +++ b/src/ggml-hexagon/htp/cpy-ops.c @@ -0,0 +1,251 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include + +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" + +struct htp_copy_context { + struct htp_ops_context * octx; + + uint32_t src0_type_size; + uint32_t src0_block_size; + + uint32_t dst_type_size; + uint32_t dst_block_size; + + uint32_t src0_blocks_per_row; + uint32_t dst_blocks_per_row; + + uint32_t src0_nrows_per_thread; + + void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith); +}; + +#define cpy_preamble \ + struct htp_tensor *src0 = &octx->src0; \ + struct htp_tensor *dst = &octx->dst; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ + const uint32_t nr = ne01; + +static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? 
(ir0 + dr) : nr; + + // copy by rows + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + #pragma unroll(2) + for (uint32_t i01 = ir0; i01 < ir1; i01++) { + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2); + hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size); + } + } + } +} + +static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + + // dst counters + int64_t k10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + // number of blocks in a row + const int64_t nk00 = ct->src0_blocks_per_row; + const int64_t nk0 = ct->dst_blocks_per_row; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + k10 += nk00 * ir0; + while (k10 >= nk0) { + k10 -= nk0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t k00 = 0; k00 < nk00; k00++) { + const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + memcpy(dst_ptr, src0_ptr, ct->dst_type_size); + + if (++k10 == nk0) { + k10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + k10 += nk00 * (ne01 - ir1); + while (k10 >= nk0) { + k10 -= nk0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } +} + +static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + + // copy by rows + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + #pragma unroll(2) + for (uint32_t i01 = ir0; i01 < ir1; i01++) { + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + hex_l2fetch(src0_ptr, ne00 * sizeof(float), nb01, 2); + hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00); + } + } + } +} + +static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { + cpy_preamble; + + // parallelize by src0 rows + const uint32_t dr = ct->src0_nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = (ir0 + dr) < nr ? 
(ir0 + dr) : nr; + + // copy by rows + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + #pragma unroll(2) + for (uint32_t i01 = ir0; i01 < ir1; i01++) { + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + hex_l2fetch(src0_ptr, ne00 * sizeof(__fp16), nb01, 2); + hvx_copy_f32_f16_uu(dst_ptr, src0_ptr, ne00); + } + } + } +} + +static void cpy_work_func(unsigned int n, unsigned int i, void *data) { + struct htp_copy_context *ct = (struct htp_copy_context *) data; + ct->copy(ct, ct->octx, n, i); +} + +int op_cpy(struct htp_ops_context * octx) { + cpy_preamble; + + struct htp_copy_context ct; + ct.octx = octx; + + switch (src0->type) { + case HTP_TYPE_F32: ct.src0_type_size = 4; ct.src0_block_size = 1; ct.src0_blocks_per_row = ne00 / 1; break; + case HTP_TYPE_F16: ct.src0_type_size = 2; ct.src0_block_size = 1; ct.src0_blocks_per_row = ne00 / 1; break; + default: + return HTP_STATUS_NO_SUPPORT; + } + + switch (dst->type) { + case HTP_TYPE_F32: ct.dst_type_size = 4; ct.dst_block_size = 1; ct.dst_blocks_per_row = ne0 / 1; break; + case HTP_TYPE_F16: ct.dst_type_size = 2; ct.dst_block_size = 1; ct.dst_blocks_per_row = ne0 / 1; break; + default: + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const bool sametype = (src0->type == dst->type); + const bool transposed = (nb00 > nb01) || (nb0 > nb1); + const bool sameshape = !transposed && (ne00 == ne0 && ne01 == ne1 && ne02 == ne2 && ne03 == ne3); + + const uint32_t n_jobs = MIN(nr, octx->n_threads); + ct.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; + + if (sametype && sameshape) { + ct.copy = cpy_thread_sametype_sameshape; + } else if (sameshape) { + /**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32) + ct.copy = cpy_thread_f16_f32_sameshape; + else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16) + ct.copy = cpy_thread_f32_f16_sameshape; + else + return HTP_STATUS_NO_SUPPORT; + } else if (sametype) { + ct.copy = cpy_thread_sametype_reshape; + } else { + return HTP_STATUS_NO_SUPPORT; + } + + worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_jobs); + + return HTP_STATUS_OK; +} diff --git a/src/ggml-hexagon/htp/flash-attn-ops.c b/src/ggml-hexagon/htp/flash-attn-ops.c index 04a7b843..1de47d0f 100644 --- a/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/src/ggml-hexagon/htp/flash-attn-ops.c @@ -2,25 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include + #include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" // Dot product of FP32 and FP16 vectors, accumulating to float static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) { @@ -70,8 +65,8 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s)); - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = 
Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(r, 4, rsum); } @@ -111,8 +106,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s)); - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(r, 4, rsum); } @@ -124,7 +119,7 @@ static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - HVX_Vector S = hvx_vec_splat_fp16(s); + HVX_Vector S = hvx_vec_splat_f16(s); uint32_t i = 0; #pragma unroll(4) @@ -148,7 +143,7 @@ static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict if (nloe) { HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i])); - hvx_vec_store_u(&ptr_y[i], nloe * 4, xy); + hvx_vec_store_a(&ptr_y[i], nloe * 4, xy); } } } @@ -225,18 +220,18 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in const uint32_t DV = nev0; const size_t size_q_row = DK * ((q->type == HTP_TYPE_F32) ? 4 : 2); - const size_t size_q_row_padded = htp_round_up(size_q_row, 128); + const size_t size_q_row_padded = hex_round_up(size_q_row, 128); const size_t size_k_row = DK * sizeof(__fp16); const size_t size_v_row = DV * sizeof(__fp16); const size_t size_m_row = FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16); // Treat block as one row for mask - const size_t size_k_row_padded = htp_round_up(size_k_row, 128); - const size_t size_v_row_padded = htp_round_up(size_v_row, 128); + const size_t size_k_row_padded = hex_round_up(size_k_row, 128); + const size_t size_v_row_padded = hex_round_up(size_v_row, 128); const size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE; const size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE; - const size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); + const size_t size_m_block = hex_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); // Scratchpad buffers for Q, K, V, Mask, and VKQ32 accumulator uint8_t * spad_q = octx->src0_spad.data + octx->src0_spad.size_per_thread * ith; @@ -272,8 +267,8 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in float M = -INFINITY; // maximum KQ value // Clear accumulator + hvx_splat_f32_a(spad_a, 0, DV); float * VKQ32 = (float *) spad_a; - memset(VKQ32, 0, DV * sizeof(float)); const __fp16 * mp_base = NULL; if (mask) { @@ -340,30 +335,30 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in // 2. Softcap if (logit_softcap != 0.0f) { - scores = hvx_vec_tanh_fp32(scores); - scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_fp32(logit_softcap)); + scores = hvx_vec_tanh_f32(scores); + scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_f32(logit_softcap)); scores = Q6_Vsf_equals_Vqf32(scores); } // 3. 
Mask if (mask) { const __fp16 * mp = m_base + ic; - HVX_Vector m_vals_fp16 = *(const HVX_UVector *) mp; + HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp; - HVX_Vector one_fp16 = Q6_Vh_vsplat_R(0x3c00); - HVX_VectorPair m_vals_fp32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_fp16), one_fp16); + HVX_Vector one_f16 = Q6_Vh_vsplat_R(0x3c00); + HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), one_f16); - HVX_Vector m_vals_fp32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_fp32_pair)); + HVX_Vector m_vals_f32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_f32_pair)); - HVX_Vector slope_vec = hvx_vec_splat_fp32(slope); - HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_fp32, slope_vec); + HVX_Vector slope_vec = hvx_vec_splat_f32(slope); + HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_f32, slope_vec); scores = Q6_Vqf32_vadd_VsfVsf(scores, Q6_Vsf_equals_Vqf32(add_val)); scores = Q6_Vsf_equals_Vqf32(scores); } // 4. Online Softmax Update - HVX_Vector v_max = hvx_vec_reduce_max_fp32(scores); - float m_block = hvx_vec_get_fp32(v_max); + HVX_Vector v_max = hvx_vec_reduce_max_f32(scores); + float m_block = hvx_vec_get_f32(v_max); float M_old = M; float M_new = (m_block > M) ? m_block : M; @@ -374,12 +369,12 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms); S = S * ms; - HVX_Vector M_new_vec = hvx_vec_splat_fp32(M_new); + HVX_Vector M_new_vec = hvx_vec_splat_f32(M_new); HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_new_vec); - HVX_Vector P = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(scores_shifted)); + HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted)); - HVX_Vector p_sum_vec = hvx_vec_fp32_reduce_sum(P); - float p_sum = hvx_vec_get_fp32(p_sum_vec); + HVX_Vector p_sum_vec = hvx_vec_reduce_sum_f32(P); + float p_sum = hvx_vec_get_f32(p_sum_vec); S += p_sum; // 5. Accumulate V @@ -484,9 +479,9 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1; if (dst->type == HTP_TYPE_F32) { - hvx_copy_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV); + hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); } else if (dst->type == HTP_TYPE_F16) { - hvx_copy_fp16_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV); + hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); } } } @@ -523,16 +518,16 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { octx->src3_div3 = init_fastdiv_values(mask->ne[3]); } - size_t size_q_row_padded = htp_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 4 : 2), 128); - size_t size_k_row_padded = htp_round_up(k->ne[0] * sizeof(__fp16), 128); - size_t size_v_row_padded = htp_round_up(v->ne[0] * sizeof(__fp16), 128); + size_t size_q_row_padded = hex_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 
4 : 2), 128); + size_t size_k_row_padded = hex_round_up(k->ne[0] * sizeof(__fp16), 128); + size_t size_v_row_padded = hex_round_up(v->ne[0] * sizeof(__fp16), 128); size_t size_q_block = size_q_row_padded * 1; // single row for now size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE; size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE; - size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); + size_t size_m_block = hex_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128); - size_t size_vkq_acc = htp_round_up(v->ne[0] * sizeof(float), 128); // VKQ32 + size_t size_vkq_acc = hex_round_up(v->ne[0] * sizeof(float), 128); // VKQ32 octx->src0_spad.size_per_thread = size_q_block * 1; octx->src1_spad.size_per_thread = size_k_block * 2; diff --git a/src/ggml-hexagon/htp/get-rows-ops.c b/src/ggml-hexagon/htp/get-rows-ops.c index 54321421..a657cd2d 100644 --- a/src/ggml-hexagon/htp/get-rows-ops.c +++ b/src/ggml-hexagon/htp/get-rows-ops.c @@ -2,14 +2,9 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include + #include #include @@ -19,7 +14,6 @@ #include "htp-msg.h" #include "htp-ops.h" #include "hvx-utils.h" -#include "ops-utils.h" #define get_rows_preamble \ const uint32_t ne00 = octx->src0.ne[0]; \ @@ -72,7 +66,7 @@ static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03; const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3; - hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); + hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } return HTP_STATUS_OK; diff --git a/src/ggml-hexagon/htp/hex-dma.c b/src/ggml-hexagon/htp/hex-dma.c new file mode 100644 index 00000000..44e1be40 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-dma.c @@ -0,0 +1,63 @@ +#include "hex-dma.h" + +#include +#include +#include + +#pragma clang diagnostic ignored "-Wunused-function" + +static inline uint32_t pow2_ceil(uint32_t x) { + if (x <= 1) { + return 1; + } + int p = 2; + x--; + while (x >>= 1) { + p <<= 1; + } + return p; +} + +dma_queue * dma_queue_create(size_t capacity) { + dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue)); + if (q == NULL) { + FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__); + return NULL; + } + + capacity = pow2_ceil(capacity); + + memset(q, 0, sizeof(dma_queue)); + q->capacity = capacity; + q->idx_mask = capacity - 1; + + q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t)); + memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t)); + + q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr)); + memset(q->dptr, 0, capacity * sizeof(dma_ptr)); + + q->tail = &q->desc[capacity - 1]; + + if (!q->desc && !q->dptr) { + FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__); + return NULL; + } + + FARF(HIGH, "dma-queue: capacity %u\n", capacity); + + return q; +} + +void dma_queue_delete(dma_queue * q) { + if (!q) { + return; + } + free(q->desc); + free(q->dptr); + free(q); +} + +void dma_queue_flush(dma_queue * q) { + while (dma_queue_pop(q).dst != NULL) ; +} diff --git a/src/ggml-hexagon/htp/hex-dma.h b/src/ggml-hexagon/htp/hex-dma.h new file mode 100644 index 00000000..d1ddb0ec --- /dev/null +++ b/src/ggml-hexagon/htp/hex-dma.h @@ 
-0,0 +1,156 @@ +#ifndef HTP_DMA_H +#define HTP_DMA_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + void *dst; + const void *src; +} dma_ptr; + +typedef struct { + hexagon_udma_descriptor_type1_t * desc; // descriptor pointers + hexagon_udma_descriptor_type1_t * tail; // tail pointer + dma_ptr * dptr; // dst/src pointers + uint32_t push_idx; + uint32_t pop_idx; + uint32_t capacity; + uint32_t idx_mask; +} dma_queue; + +dma_queue * dma_queue_create(size_t capacity); +void dma_queue_delete(dma_queue * q); +void dma_queue_flush(dma_queue * q); + +// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead +// but those do not seem to always compiler properly. +static inline void dmstart(void * next) { + asm volatile(" release(%0):at" : : "r"(next)); + asm volatile(" dmstart(%0)" : : "r"(next)); +} + +static inline void dmlink(void * cur, void * next) { + asm volatile(" release(%0):at" : : "r"(next)); + asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next)); +} + +static inline unsigned int dmpoll(void) { + unsigned int ret = 0; + asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory"); + return ret; +} + +static inline unsigned int dmwait(void) { + unsigned int ret = 0; + asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory"); + return ret; +} + +static inline dma_ptr dma_make_ptr(void *dst, const void *src) +{ + dma_ptr p = { dst, src }; + return p; +} + +static inline bool dma_queue_push(dma_queue * q, + dma_ptr dptr, + size_t dst_row_size, + size_t src_row_size, + size_t width, // width in bytes. number of bytes to transfer per row + size_t nrows) { + if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { + FARF(ERROR, "dma-push: queue full\n"); + return false; + } + + hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx]; + + desc->next = NULL; + desc->length = 0; + desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1; + desc->dstbypass = 1; + desc->srcbypass = 1; +#if __HVX_ARCH__ >= 73 + desc->dstbypass = 1; + desc->srcbypass = 1; +#else + desc->dstbypass = 0; + desc->srcbypass = 1; +#endif + desc->order = 0; + desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; + desc->src = (void *) dptr.src; + desc->dst = (void *) dptr.dst; + desc->allocation = 0; + desc->padding = 0; + desc->roiwidth = width; + desc->roiheight = nrows; + desc->srcstride = src_row_size; + desc->dststride = dst_row_size; + desc->srcwidthoffset = 0; + desc->dstwidthoffset = 0; + + q->dptr[q->push_idx] = dptr; + + dmlink(q->tail, desc); + q->tail = desc; + + // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); + q->push_idx = (q->push_idx + 1) & q->idx_mask; + return true; +} + +static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, + dma_ptr dptr, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows); +} + + +static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, + dma_ptr dptr, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows); +} + +static inline dma_ptr dma_queue_pop(dma_queue * q) { + dma_ptr dptr = { NULL }; + + if (q->push_idx == q->pop_idx) { + return dptr; + } + + hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx]; + + // Wait for desc to complete + while (1) { + dmpoll(); + if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) { + break; + } + // FARF(ERROR, "dma-pop: waiting for DMA 
: %u\n", q->pop_idx); + } + + dptr = q->dptr[q->pop_idx]; + + // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); + q->pop_idx = (q->pop_idx + 1) & q->idx_mask; + return dptr; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HTP_DMA_H */ diff --git a/src/ggml-hexagon/htp/hex-dump.h b/src/ggml-hexagon/htp/hex-dump.h new file mode 100644 index 00000000..e3badb57 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-dump.h @@ -0,0 +1,77 @@ +#ifndef HEX_DUMP_H +#define HEX_DUMP_H + +#include + +static inline void hex_dump_int8_line(char * pref, const int8_t * x, int n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n && p < p_end; i++) { + p += snprintf(p, p_end - p, "%d, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n && p < p_end; i++) { + p += snprintf(p, p_end - p, "%d, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_int32_line(char * pref, const int32_t * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += snprintf(p, p_end - p, "%d, ", (int) x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_f16_line(char * pref, const __fp16 * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_f32_line(char * pref, const float * x, uint32_t n) { + char str[1024], *p = str, *p_end = str + sizeof(str); + p += snprintf(p, p_end - p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += snprintf(p, p_end - p, "%.6f, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void hex_dump_f32(char * pref, const float * x, uint32_t n) { + uint32_t n0 = n / 16; + uint32_t n1 = n % 16; + + uint32_t i = 0; + for (; i < n0; i++) { + hex_dump_f32_line(pref, x + (16 * i), 16); + } + if (n1) { + hex_dump_f32_line(pref, x + (16 * i), n1); + } +} + +static inline void hex_dump_f16(char * pref, const __fp16 * x, uint32_t n) { + uint32_t n0 = n / 16; + uint32_t n1 = n % 16; + + uint32_t i = 0; + for (; i < n0; i++) { + hex_dump_f16_line(pref, x + (16 * i), 16); + } + if (n1) { + hex_dump_f16_line(pref, x + (16 * i), n1); + } +} + +#endif /* HEX_DUMP_H */ diff --git a/src/ggml-hexagon/htp/hex-fastdiv.h b/src/ggml-hexagon/htp/hex-fastdiv.h new file mode 100644 index 00000000..b7b58675 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-fastdiv.h @@ -0,0 +1,37 @@ +#ifndef HEX_FASTDIV_H +#define HEX_FASTDIV_H + +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. 
+// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +struct fastdiv_values { + uint32_t mp; + uint32_t l; +}; + +static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { + struct fastdiv_values result = { 0, 0 }; + // compute L = ceil(log2(d)); + while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { + ++(result.l); + } + + result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); + return result; +} + +static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { + // Compute high 32 bits of n * mp + const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32); // mulhi(n, mp) + // add n, apply bit shift + return (hi + n) >> vals->l; +} + +static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) { + return n - fastdiv(n, vals) * d; +} + +#endif /* HEX_FASTDIV_H */ diff --git a/src/ggml-hexagon/htp/hex-utils.h b/src/ggml-hexagon/htp/hex-utils.h new file mode 100644 index 00000000..fb8a25a3 --- /dev/null +++ b/src/ggml-hexagon/htp/hex-utils.h @@ -0,0 +1,51 @@ +#ifndef HEX_UTILS_H +#define HEX_UTILS_H + +#include +#include + +#include "hexagon_types.h" + +#include "hex-fastdiv.h" +#include "hex-dump.h" + +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +static inline uint64_t hex_get_cycles() { + uint64_t cycles = 0; + asm volatile(" %0 = c15:14\n" : "=r"(cycles)); + return cycles; +} + +static inline uint64_t hex_get_pktcnt() { + uint64_t pktcnt; + asm volatile(" %0 = c19:18\n" : "=r"(pktcnt)); + return pktcnt; +} + +static inline int32_t hex_is_aligned(void * addr, uint32_t align) { + return ((size_t) addr & (align - 1)) == 0; +} + +static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +static inline uint32_t hex_round_up(uint32_t n, uint32_t m) { + return m * ((n + m - 1) / m); +} + +static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) { + const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); + Q6_l2fetch_AP((void *) p, control); +} + +#endif /* HEX_UTILS_H */ diff --git a/src/ggml-hexagon/htp/htp-ctx.h b/src/ggml-hexagon/htp/htp-ctx.h index 4bd0ea7a..a707d982 100644 --- a/src/ggml-hexagon/htp/htp-ctx.h +++ b/src/ggml-hexagon/htp/htp-ctx.h @@ -1,7 +1,7 @@ #ifndef HTP_CTX_H #define HTP_CTX_H -#include "htp-dma.h" +#include "hex-dma.h" #include "worker-pool.h" #include diff --git a/src/ggml-hexagon/htp/htp-dma.c b/src/ggml-hexagon/htp/htp-dma.c deleted file mode 100644 index 880c4542..00000000 --- a/src/ggml-hexagon/htp/htp-dma.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "htp-dma.h" - -#include -#include -#include - -#pragma clang diagnostic ignored "-Wunused-function" - -static inline uint32_t pow2_ceil(uint32_t x) { - if (x <= 1) { - return 1; - } - int p = 2; - x--; - while (x >>= 1) { - p <<= 1; - } - return p; -} - -dma_queue * dma_queue_create(size_t capacity) { - dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue)); - if (q == NULL) { - FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__); - return NULL; - } - - capacity = pow2_ceil(capacity); - - memset(q, 0, sizeof(dma_queue)); - q->capacity = capacity; - q->idx_mask 
= capacity - 1; - - q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t)); - memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t)); - - q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr)); - memset(q->dptr, 0, capacity * sizeof(dma_ptr)); - - q->tail = &q->desc[capacity - 1]; - - if (!q->desc && !q->dptr) { - FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__); - return NULL; - } - - FARF(HIGH, "dma-queue: capacity %u\n", capacity); - - return q; -} - -void dma_queue_delete(dma_queue * q) { - if (!q) { - return; - } - free(q->desc); - free(q->dptr); - free(q); -} - -void dma_queue_flush(dma_queue * q) { - while (dma_queue_pop(q).dst != NULL) ; -} diff --git a/src/ggml-hexagon/htp/htp-dma.h b/src/ggml-hexagon/htp/htp-dma.h deleted file mode 100644 index 32fd06e7..00000000 --- a/src/ggml-hexagon/htp/htp-dma.h +++ /dev/null @@ -1,157 +0,0 @@ -#ifndef HTP_DMA_H -#define HTP_DMA_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - void *dst; - const void *src; -} dma_ptr; - -typedef struct { - hexagon_udma_descriptor_type1_t * desc; // descriptor pointers - hexagon_udma_descriptor_type1_t * tail; // tail pointer - dma_ptr * dptr; // dst/src pointers - uint32_t push_idx; - uint32_t pop_idx; - uint32_t capacity; - uint32_t idx_mask; -} dma_queue; - -dma_queue * dma_queue_create(size_t capacity); -void dma_queue_delete(dma_queue * q); -void dma_queue_flush(dma_queue * q); - -// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead -// but those do not seem to always compiler properly. -static inline void dmstart(void * next) { - asm volatile(" release(%0):at" : : "r"(next)); - asm volatile(" dmstart(%0)" : : "r"(next)); -} - -static inline void dmlink(void * cur, void * next) { - asm volatile(" release(%0):at" : : "r"(next)); - asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next)); -} - -static inline unsigned int dmpoll(void) { - unsigned int ret = 0; - asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory"); - return ret; -} - -static inline unsigned int dmwait(void) { - unsigned int ret = 0; - asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory"); - return ret; -} - -static inline dma_ptr dma_make_ptr(void *dst, const void *src) -{ - dma_ptr p = { dst, src }; - return p; -} - -static inline bool dma_queue_push(dma_queue * q, - dma_ptr dptr, - size_t dst_row_size, - size_t src_row_size, - size_t width, // width in bytes. 
number of bytes to transfer per row - size_t nrows) { - if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { - FARF(ERROR, "dma-push: queue full\n"); - return false; - } - - hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx]; - - desc->next = NULL; - desc->length = 0; - desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1; - desc->dstbypass = 1; - desc->srcbypass = 1; -#if __HVX_ARCH__ >= 73 - desc->dstbypass = 1; - desc->srcbypass = 1; -#else - desc->dstbypass = 0; - desc->srcbypass = 1; -#endif - desc->order = 0; - desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; - desc->src = (void *) dptr.src; - desc->dst = (void *) dptr.dst; - desc->allocation = 0; - desc->padding = 0; - desc->roiwidth = width; - desc->roiheight = nrows; - desc->srcstride = src_row_size; - desc->dststride = dst_row_size; - desc->srcwidthoffset = 0; - desc->dstwidthoffset = 0; - - q->dptr[q->push_idx] = dptr; - - dmlink(q->tail, desc); - q->tail = desc; - - // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); - q->push_idx = (q->push_idx + 1) & q->idx_mask; - return true; -} - -static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, - dma_ptr dptr, - size_t dst_row_size, - size_t src_row_size, - size_t nrows) { - return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows); -} - - -static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, - dma_ptr dptr, - size_t dst_row_size, - size_t src_row_size, - size_t nrows) { - return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows); -} - -static inline dma_ptr dma_queue_pop(dma_queue * q) { - dma_ptr dptr = { NULL }; - - if (q->push_idx == q->pop_idx) { - return dptr; - } - - hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx]; - - // Wait for desc to complete - while (1) { - dmpoll(); - if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) { - break; - } - // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); - } - - dptr = q->dptr[q->pop_idx]; - - // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); - q->pop_idx = (q->pop_idx + 1) & q->idx_mask; - return dptr; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif /* HTP_DMA_H */ diff --git a/src/ggml-hexagon/htp/htp-msg.h b/src/ggml-hexagon/htp/htp-msg.h index 846d0617..f49e8ee4 100644 --- a/src/ggml-hexagon/htp/htp-msg.h +++ b/src/ggml-hexagon/htp/htp-msg.h @@ -63,6 +63,7 @@ enum htp_op { HTP_OP_SET_ROWS = 15, HTP_OP_SCALE = 16, HTP_OP_GET_ROWS = 17, + HTP_OP_CPY = 18, INVALID }; diff --git a/src/ggml-hexagon/htp/htp-ops.h b/src/ggml-hexagon/htp/htp-ops.h index 7c828ae6..602a2775 100644 --- a/src/ggml-hexagon/htp/htp-ops.h +++ b/src/ggml-hexagon/htp/htp-ops.h @@ -4,11 +4,12 @@ #include "htp-ctx.h" #include "htp-msg.h" #include "worker-pool.h" -#include "ops-utils.h" #include #include +#include + // ggml-common.h must be included prior to this header struct htp_spad { @@ -74,6 +75,14 @@ struct htp_ops_context { struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10 struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11 + struct fastdiv_values cpy_div_ne01; // fastdiv values for ne01 + struct fastdiv_values cpy_div_ne02; // fastdiv values for ne02 + struct fastdiv_values cpy_div_ne03; // fastdiv values for ne03 + + struct fastdiv_values cpy_rshp_div_n0; // fastdiv values for ne00 + struct fastdiv_values cpy_rshp_div_n1n0; // fastdiv values for ne00*ne01 + struct fastdiv_values cpy_rshp_div_n2n1n0; // fastdiv values for ne00*ne01*ne02 + uint32_t flags; }; @@ 
-88,5 +97,6 @@ int op_rope(struct htp_ops_context * octx); int op_flash_attn_ext(struct htp_ops_context * octx); int op_set_rows(struct htp_ops_context * octx); int op_get_rows(struct htp_ops_context * octx); +int op_cpy(struct htp_ops_context * octx); #endif /* HTP_OPS_H */ diff --git a/src/ggml-hexagon/htp/hvx-arith.h b/src/ggml-hexagon/htp/hvx-arith.h new file mode 100644 index 00000000..3449739a --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-arith.h @@ -0,0 +1,457 @@ +#ifndef HVX_ARITH_H +#define HVX_ARITH_H + +#include +#include +#include +#include + +#include "hvx-base.h" +#include "hex-utils.h" + +// +// Binary operations (add, mul, sub) +// + +#define hvx_arith_loop_body(dst_type, src0_type, src1_type, vec_store, vec_op) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src0_type * restrict vsrc0 = (src0_type *) src0; \ + src1_type * restrict vsrc1 = (src1_type *) src1; \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = vec_op(vsrc0[i], vsrc1[i]); \ + } \ + if (nloe) { \ + HVX_Vector v = vec_op(vsrc0[i], vsrc1[i]); \ + vec_store((void *) &vdst[i], nloe * elem_size, v); \ + } \ + } while(0) + +#if __HVX_ARCH__ < 79 +#define HVX_OP_ADD(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)) +#define HVX_OP_SUB(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b)) +#define HVX_OP_MUL(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)) +#else +#define HVX_OP_ADD(a, b) Q6_Vsf_vadd_VsfVsf(a, b) +#define HVX_OP_SUB(a, b) Q6_Vsf_vsub_VsfVsf(a, b) +#define HVX_OP_MUL(a, b) Q6_Vsf_vmpy_VsfVsf(a, b) +#endif + +// ADD variants + +static inline void hvx_add_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_ADD); +} + +static inline void hvx_add_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_ADD); +} + +static inline void hvx_add_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_ADD); +} + +static inline void hvx_add_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_ADD); +} + +// SUB variants + +static inline void hvx_sub_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_SUB); +} + +static inline void hvx_sub_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned 
long) src0 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_SUB); +} + +static inline void hvx_sub_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_SUB); +} + +static inline void hvx_sub_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_SUB); +} + +// MUL variants + +static inline void hvx_mul_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MUL); +} + +static inline void hvx_mul_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MUL); +} + +static inline void hvx_mul_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_MUL); +} + +static inline void hvx_mul_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) { + hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MUL); +} + +// Dispatchers + +static inline void hvx_add_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) { + if (hex_is_aligned((void *) src1, 128)) { + hvx_add_f32_aa(dst, src0, src1, num_elems); + } else { + hvx_add_f32_au(dst, src0, src1, num_elems); + } + } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) { + hvx_add_f32_ua(dst, src0, src1, num_elems); + } else { + hvx_add_f32_uu(dst, src0, src1, num_elems); + } +} + +static inline void hvx_sub_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) { + if (hex_is_aligned((void *) src1, 128)) { + hvx_sub_f32_aa(dst, src0, src1, num_elems); + } else { + hvx_sub_f32_au(dst, src0, src1, num_elems); + } + } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) { + hvx_sub_f32_ua(dst, src0, src1, num_elems); + } else { + hvx_sub_f32_uu(dst, src0, src1, num_elems); + } +} + +static inline void hvx_mul_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) { + if (hex_is_aligned((void *) src1, 128)) { + hvx_mul_f32_aa(dst, src0, src1, num_elems); + } else { + hvx_mul_f32_au(dst, src0, src1, num_elems); + } + } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) { + 
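                // reaching this branch means dst is unaligned while both sources are
                // aligned, so the _ua variant uses aligned loads with unaligned stores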
hvx_mul_f32_ua(dst, src0, src1, num_elems); + } else { + hvx_mul_f32_uu(dst, src0, src1, num_elems); + } +} + +// Mul-Mul Optimized + +static inline void hvx_mul_mul_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint8_t * restrict src2, const uint32_t num_elems) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src0 % 128 == 0); + assert((unsigned long) src1 % 128 == 0); + assert((unsigned long) src2 % 128 == 0); + + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + HVX_Vector * restrict vsrc0 = (HVX_Vector *) src0; + HVX_Vector * restrict vsrc1 = (HVX_Vector *) src1; + HVX_Vector * restrict vsrc2 = (HVX_Vector *) src2; + + const uint32_t elem_size = sizeof(float); + const uint32_t epv = 128 / elem_size; + const uint32_t nvec = num_elems / epv; + const uint32_t nloe = num_elems % epv; + + uint32_t i = 0; + + _Pragma("unroll(4)") + for (; i < nvec; i++) { + HVX_Vector v1 = HVX_OP_MUL(vsrc0[i], vsrc1[i]); + vdst[i] = HVX_OP_MUL(v1, vsrc2[i]); + } + + if (nloe) { + HVX_Vector v1 = HVX_OP_MUL(vsrc0[i], vsrc1[i]); + HVX_Vector v2 = HVX_OP_MUL(v1, vsrc2[i]); + hvx_vec_store_a((void *) &vdst[i], nloe * elem_size, v2); + } +} + +// Scalar Operations + +#define hvx_scalar_loop_body(dst_type, src_type, vec_store, scalar_op_macro) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + HVX_Vector v = vsrc[i]; \ + vdst[i] = scalar_op_macro(v); \ + } \ + if (nloe) { \ + HVX_Vector v = vsrc[i]; \ + v = scalar_op_macro(v); \ + vec_store((void *) &vdst[i], nloe * elem_size, v); \ + } \ + } while(0) + +#define HVX_OP_ADD_SCALAR(v) \ + ({ \ + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v); \ + HVX_Vector out = HVX_OP_ADD(v, val_vec); \ + Q6_V_vmux_QVV(pred_inf, inf, out); \ + }) + +#define HVX_OP_MUL_SCALAR(v) HVX_OP_MUL(v, val_vec) +#define HVX_OP_SUB_SCALAR(v) HVX_OP_SUB(v, val_vec) + +// Add Scalar Variants + +static inline void hvx_add_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + const HVX_Vector inf = hvx_vec_splat_f32(INFINITY); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_ADD_SCALAR); +} + +static inline void hvx_add_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + const HVX_Vector inf = hvx_vec_splat_f32(INFINITY); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_ADD_SCALAR); +} + +static inline void hvx_add_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + const HVX_Vector inf = hvx_vec_splat_f32(INFINITY); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_ADD_SCALAR); +} + +static inline void hvx_add_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + static const float kInf = 
INFINITY; + const HVX_Vector inf = hvx_vec_splat_f32(kInf); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_ADD_SCALAR); +} + +// Sub Scalar Variants + +static inline void hvx_sub_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_SUB_SCALAR); +} + +static inline void hvx_sub_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_SUB_SCALAR); +} + +static inline void hvx_sub_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_SUB_SCALAR); +} + +static inline void hvx_sub_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_SUB_SCALAR); +} + +// Mul Scalar Variants + +static inline void hvx_mul_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_mul_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_mul_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_mul_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MUL_SCALAR); +} + +static inline void hvx_add_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_add_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_add_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_add_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_add_scalar_f32_uu(dst, src, val, num_elems); + } +} + +static inline void hvx_mul_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_mul_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + 
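        // only dst is aligned here: the _au variant uses unaligned loads with aligned stores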
hvx_mul_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_mul_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_mul_scalar_f32_uu(dst, src, val, num_elems); + } +} + +static inline void hvx_sub_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_sub_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_sub_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_sub_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_sub_scalar_f32_uu(dst, src, val, num_elems); + } +} + +// MIN Scalar variants + +#define HVX_OP_MIN_SCALAR(v) Q6_Vsf_vmin_VsfVsf(val_vec, v) + +static inline void hvx_min_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) { + const HVX_Vector val_vec = hvx_vec_splat_f32(val); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MIN_SCALAR); +} + +static inline void hvx_min_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_min_scalar_f32_aa(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_min_scalar_f32_au(dst, src, val, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_min_scalar_f32_ua(dst, src, val, num_elems); + } else { + hvx_min_scalar_f32_uu(dst, src, val, num_elems); + } +} + +// CLAMP Scalar variants + +#define HVX_OP_CLAMP_SCALAR(v) \ + ({ \ + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(v, max_vec); \ + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(min_vec, v); \ + HVX_Vector tmp = Q6_V_vmux_QVV(pred_cap_right, max_vec, v); \ + Q6_V_vmux_QVV(pred_cap_left, min_vec, tmp); \ + }) + +static inline void hvx_clamp_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector 
min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + assert((unsigned long) dst % 128 == 0); + hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + assert((unsigned long) src % 128 == 0); + hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) { + const HVX_Vector min_vec = hvx_vec_splat_f32(min); + const HVX_Vector max_vec = hvx_vec_splat_f32(max); + hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_CLAMP_SCALAR); +} + +static inline void hvx_clamp_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, const int num_elems) { + if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { + hvx_clamp_scalar_f32_aa(dst, src, min, max, num_elems); + } else if (hex_is_aligned((void *) dst, 128)) { + hvx_clamp_scalar_f32_au(dst, src, min, max, num_elems); + } else if (hex_is_aligned((void *) src, 128)) { + hvx_clamp_scalar_f32_ua(dst, src, min, max, num_elems); + } else { + hvx_clamp_scalar_f32_uu(dst, src, min, max, num_elems); + } +} + +#undef HVX_OP_ADD +#undef HVX_OP_SUB +#undef HVX_OP_MUL +#undef hvx_arith_loop_body +#undef HVX_OP_ADD_SCALAR +#undef HVX_OP_SUB_SCALAR +#undef HVX_OP_MUL_SCALAR +#undef hvx_scalar_loop_body +#undef HVX_OP_MIN_SCALAR +#undef HVX_OP_CLAMP_SCALAR + +#endif // HVX_ARITH_H diff --git a/src/ggml-hexagon/htp/hvx-base.h b/src/ggml-hexagon/htp/hvx-base.h new file mode 100644 index 00000000..ffa6e18e --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-base.h @@ -0,0 +1,167 @@ +#ifndef HVX_BASE_H +#define HVX_BASE_H + +#include +#include + +#include "hex-utils.h" +#include "hvx-types.h" + +static inline void hvx_vec_store_u(void * restrict dst, uint32_t n, HVX_Vector v) { + // Rotate as needed. 
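    // The idea: rotate the data so every byte lands at its target offset within
    // the (at most two) 128-byte aligned vectors covering dst..dst+n, then merge
    // only those n bytes using byte predicates. For example (illustrative values):
    // a dst offset of 40 within its aligned vector and n = 100 gives right_off = 140,
    // so the 12 spill bytes are stored into the next aligned vector first and the
    // final masked store then writes bytes 40..127 of the first one.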
+ v = Q6_V_vlalign_VVR(v, v, (size_t) dst); + + uint32_t left_off = (size_t) dst & 127; + uint32_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) dst); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + if (right_off > 128) { + Q6_vmem_QRIV(qr, (HVX_Vector *) dst + 1, v); + // all 1's + qr = Q6_Q_vcmp_eq_VbVb(v, v); + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector *) dst, v); +} + +static inline void hvx_vec_store_a(void * restrict dst, uint32_t n, HVX_Vector v) { + assert((unsigned long) dst % 128 == 0); + HVX_VectorPred m = Q6_Q_or_QQn(Q6_Q_vsetq_R((unsigned long) dst), Q6_Q_vsetq2_R(n)); + Q6_vmem_QnRIV(m, (HVX_Vector *) dst, v); +} + +static inline HVX_Vector hvx_vec_splat_f32(float v) { + union { float f; uint32_t i; } u = { .f = v }; + return Q6_V_vsplat_R(u.i); +} + +static inline HVX_Vector hvx_vec_splat_f16(float v) { + union { __fp16 f; uint16_t i; } u = { .f = v }; + return Q6_Vh_vsplat_R(u.i); +} + +static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) { + // vdelta control to replicate first 4 bytes across all elements + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + + HVX_Vector ctrl = *(HVX_Vector *) repl; + return Q6_V_vdelta_VV(v, ctrl); +} + +static inline float hvx_vec_get_f32(HVX_Vector v) { + float __attribute__((aligned(128))) x; + hvx_vec_store_a(&x, 4, v); + return x; +} + +static inline HVX_Vector hvx_vec_abs_f16(HVX_Vector v) { + // abs by clearing the fp16 sign bit + HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff); + return Q6_V_vand_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_neg_f16(HVX_Vector v) { + // neg by setting the fp16 sign bit + HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); + return Q6_V_vxor_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_abs_f32(HVX_Vector v) { + // abs by clearing the fp32 sign bit + HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff); + return Q6_V_vand_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_neg_f32(HVX_Vector v) { +#if __HVX_ARCH__ > 75 + return Q6_Vsf_vfneg_Vsf(v); +#else + // neg by setting the fp32 sign bit + HVX_Vector mask = Q6_V_vsplat_R(0x80000000); + return Q6_V_vxor_VV(v, mask); +#endif // __HVX_ARCH__ > 75 +} + +static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) { + const HVX_Vector vnan_exp = Q6_Vh_vsplat_R(0x7C00); + const HVX_Vector vnan_frac = Q6_Vh_vsplat_R(0x7FFF); + + // get pred of which are NaN, i.e., exponent bits all 1s and fraction bits non 0s + HVX_VectorPred p_exp = Q6_Q_vcmp_eq_VhVh(Q6_V_vand_VV(v, vnan_exp), vnan_exp); + HVX_VectorPred p_frac = Q6_Q_not_Q(Q6_Q_vcmp_eq_VhVh(Q6_V_vand_VV(v, vnan_frac), vnan_exp)); + return Q6_Q_and_QQ(p_exp, p_frac); +} + +static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) { + const 
HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero); + HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero); + HVX_Vector v = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0))); + +#if __HVX_ARCH__ < 79 + // replace NaNs with -INF, older arches produce NaNs for (-INF + 0.0) + const HVX_Vector neg_inf = hvx_vec_splat_f16(-INFINITY); + HVX_VectorPred nan = hvx_vec_is_nan_f16(v); + v = Q6_V_vmux_QVV(nan, neg_inf, v); +#endif + + return v; +} + +/* Q6_Vsf_equals_Vw is only available on v73+.*/ +#if __HVX_ARCH__ < 73 +static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in) +{ + HVX_Vector const vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); + return ret; +} + +static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) +{ + return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in)); +} +#endif + +static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { + // This looks complicated. + // Ideally should just be Q6_Vh_equals_Vhf(vin) + // but that instruction does not do proper rounding. + + // convert to qf32, multiplying by 1.0 in the process. + HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00)); + + // 'in-range' values are +/32752. + // add 192K to it, convert to sf + HVX_Vector v192K = Q6_V_vsplat_R(0x48400000); + HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K)); + HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K)); + + // for in-range cases, result is {163858... 229360} so the exponent is always 144. + // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer. + // Start by <<10 to get the final 'sign' bit in bit 15... 
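    // After the shift, Q6_Vh_vround_VwVw_sat keeps the upper halfword of each
    // 32-bit lane (original bits 21..6), rounding to nearest on the 6 bits that
    // are dropped, and packs both word vectors back into one halfword vector.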
+ vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10); + vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10); + + // now round down to 16 + return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0); +} + +#endif /* HVX_BASE_H */ diff --git a/src/ggml-hexagon/htp/hvx-copy.h b/src/ggml-hexagon/htp/hvx-copy.h new file mode 100644 index 00000000..6b617b76 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-copy.h @@ -0,0 +1,247 @@ +#ifndef HVX_COPY_H +#define HVX_COPY_H + +#include +#include +#include + +#include "hvx-base.h" + +#define hvx_splat_loop_body(dst_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + \ + uint32_t nvec = n / (128 / elem_size); \ + uint32_t nloe = n % (128 / elem_size); \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = src; \ + } \ + if (nloe) { \ + vec_store((void *) &vdst[i], nloe * elem_size, src); \ + } \ + } while(0) + +static inline void hvx_splat_a(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) dst % 128 == 0); + hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_splat_u(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) { + hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_splat_f32_a(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_a(dst, hvx_vec_splat_f32(v), n, sizeof(float)); +} + +static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_u(dst, hvx_vec_splat_f32(v), n, sizeof(float)); +} + +static inline void hvx_splat_f16_a(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16)); +} + +static inline void hvx_splat_f16_u(uint8_t * restrict dst, float v, uint32_t n) { + hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16)); +} + +#define hvx_copy_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { vdst[i] = vsrc[i]; } \ + if (nloe) { \ + vec_store((void *) &vdst[i], nloe * elem_size, vsrc[i]); \ + } \ + } while(0) + +// Generic copy routines +static inline void hvx_copy_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_copy_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_copy_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) dst % 128 == 0); + hvx_copy_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_copy_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + assert((unsigned long) src % 128 == 0); + hvx_copy_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_copy_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) { + hvx_copy_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +// copy n fp16 elements : source and destination are aligned to HVX Vector (128) +static inline void hvx_copy_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_aa(dst, src, n, sizeof(__fp16)); +} + +// copy n fp16 elements : source is aligned, 
destination is potentially unaligned +static inline void hvx_copy_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_au(dst, src, n, sizeof(__fp16)); +} + +// copy n fp16 elements : source is aligned, destination is potentially unaligned +static inline void hvx_copy_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_ua(dst, src, n, sizeof(__fp16)); +} + +// copy n fp16 elements : source is aligned, destination is potentially unaligned +static inline void hvx_copy_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_uu(dst, src, n, sizeof(__fp16)); +} + +// copy n fp32 elements : source and destination are aligned to HVX Vector (128) +static inline void hvx_copy_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_aa(dst, src, n, sizeof(float)); +} + +// copy n fp32 elements : source is aligned, destination is unaligned +static inline void hvx_copy_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_ua(dst, src, n, sizeof(float)); +} + +// copy n fp32 elements : source is unaligned, destination is aligned +static inline void hvx_copy_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_au(dst, src, n, sizeof(float)); +} + +// copy n fp32 elements : source is unaligned, destination unaligned +static inline void hvx_copy_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_uu(dst, src, n, sizeof(float)); +} + +//// fp32 -> fp16 + +#define hvx_copy_f16_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector zero = Q6_V_vsplat_R(0); \ + \ + const uint32_t elem_size = sizeof(__fp16); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]); \ + } \ + if (nloe) { \ + HVX_Vector v = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]); \ + vec_store((void *) &vdst[i], nloe * elem_size, v); \ + } \ + } while(0) + +// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is aligned +static inline void hvx_copy_f16_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned +static inline void hvx_copy_f16_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned +static inline void hvx_copy_f16_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned +static inline void hvx_copy_f16_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_f16_f32_loop_body(HVX_UVector, 
HVX_UVector, hvx_vec_store_u); +} + +//// fp16 -> fp32 + +#define hvx_copy_f32_f16_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector one = hvx_vec_splat_f16(1.0); \ + \ + const uint32_t elem_size = sizeof(__fp16); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (i = 0; i < nvec; ++i) { \ + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \ + vdst[i*2] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p)); \ + vdst[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)); \ + } \ + \ + if (nloe) { \ + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \ + \ + HVX_Vector vd = Q6_V_lo_W(p); \ + i = 2 * i; \ + \ + if (nloe >= 32) { \ + vdst[i] = Q6_Vsf_equals_Vqf32(vd); \ + nloe -= 32; ++i; vd = Q6_V_hi_W(p); \ + } \ + \ + if (nloe) { \ + vd = Q6_Vsf_equals_Vqf32(vd); \ + hvx_vec_store_u(&vdst[i], nloe * sizeof(float), vd); \ + } \ + } \ + } while(0) + +// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is aligned +static inline void hvx_copy_f32_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is aligned +static inline void hvx_copy_f32_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is unaligned +static inline void hvx_copy_f32_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is unaligned +static inline void hvx_copy_f32_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +#endif // HVX_COPY_H diff --git a/src/ggml-hexagon/htp/hvx-dump.h b/src/ggml-hexagon/htp/hvx-dump.h new file mode 100644 index 00000000..e8822278 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-dump.h @@ -0,0 +1,132 @@ +#ifndef HVX_DUMP_H +#define HVX_DUMP_H + +#include + +#include +#include + +#include "hex-utils.h" +#include "hvx-types.h" + +static void hvx_vec_dump_f16_n(char * pref, HVX_Vector v, uint32_t n) { + HVX_VectorAlias u = { .v = v }; + + const uint32_t n0 = n / 16; + const uint32_t n1 = n % 16; + int i = 0; + for (; i < n0; i++) { + hex_dump_f16_line(pref, u.fp16 + (16 * i), 16); + } + if (n1) { + hex_dump_f16_line(pref, u.fp16 + (16 * i), n1); + } +} + +static void hvx_vec_dump_f16(char * pref, HVX_Vector v) { + hvx_vec_dump_f16_n(pref, v, 64); +} + +static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) { + union { + HVX_Vector v; + float d[32]; + } u = { .v = v }; + + const uint32_t n0 = n / 16; + const uint32_t n1 = n % 16; + int i = 0; + for (; i < n0; i++) { + hex_dump_f32_line(pref, u.d + (16 * i), 16); + } + if (n1) { + hex_dump_f32_line(pref, u.d + (16 * i), n1); + } +} + 
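// The _hmt variants below (presumably "head, middle, tail") log only a sample of
// the lanes (elements 0..3, 12..15 and 28..31 for f32) so a single FARF line is
// enough to eyeball a vector, e.g. hvx_vec_dump_f32_hmt("acc", acc) for some
// accumulator vector acc.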
+static void hvx_vec_dump_f32_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + float d[32]; + } u = { .v = v }; + + FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1], + u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); +} + +static void hvx_vec_dump_f32(char * pref, HVX_Vector v) { + hvx_vec_dump_f32_n(pref, v, 32); +} + +static void hvx_vec_dump_int32(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int32_t d[32]; + } u = { .v = v }; + + for (int i = 0; i < 32 / 16; i++) { + hex_dump_int32_line(pref, u.d + (16 * i), 16); + } +} + +static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int32_t d[32]; + } u = { .v = v }; + + FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12], + u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); +} + +static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int8_t d[128]; + } u = { .v = v }; + + FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60], + u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]); +} + +static void hvx_vec_dump_int8(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int8_t d[128]; + } u = { .v = v }; + + for (int i = 0; i < 128 / 16; i++) { + hex_dump_int8_line(pref, u.d + (16 * i), 16); + } +} + +static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + uint8_t d[128]; + } u = { .v = v }; + + for (int i = 0; i < 128 / 16; i++) { + hex_dump_uint8_line(pref, u.d + (16 * i), 16); + } +} + +static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) { + typedef union { + HVX_Vector v; + int8_t d[128]; + } U; + + U u0 = { .v = v0 }; + U u1 = { .v = v1 }; + + for (int i = 0; i < n; i++) { + if (u0.d[i] != u1.d[i]) { + return false; + } + } + + return true; +} + +#endif /* HVX_DUMP_H */ diff --git a/src/ggml-hexagon/htp/hvx-exp.c b/src/ggml-hexagon/htp/hvx-exp.c deleted file mode 100644 index 21bf46a5..00000000 --- a/src/ggml-hexagon/htp/hvx-exp.c +++ /dev/null @@ -1,94 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "htp-ctx.h" -#include "htp-dma.h" -#include "htp-msg.h" -#include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" - -static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) { - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); - - HVX_Vector out = hvx_vec_exp_fp32(in_vec); - - return Q6_V_vmux_QVV(pred0, inf, out); -} - -void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - // assert((0 == unaligned_addr) || (0 == num_elems_whole)); - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_exp_f32: unaligned loop 
in hvx op, possibly slower execution\n"); - } - - HVX_Vector vec_out = Q6_V_vzero(); - - static const float kInf = INFINITY; - static const float kMaxExp = 88.02f; // log(INF) - - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector inf = hvx_vec_splat_fp32(kInf); - - if (0 == unaligned_loop) { - HVX_Vector * p_vec_in1 = (HVX_Vector *) src; - HVX_Vector * p_vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - if (true == negate) { - HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); - } else { - *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf); - } - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - if (true == negate) { - HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); - } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf); - } - } - } - - if (left_over > 0) { - const float * srcf = (float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - if (true == negate) { - HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - - vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); - } else { - vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf); - } - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); - } -} diff --git a/src/ggml-hexagon/htp/hvx-exp.h b/src/ggml-hexagon/htp/hvx-exp.h new file mode 100644 index 00000000..44dfe232 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-exp.h @@ -0,0 +1,215 @@ +#ifndef HVX_EXP_H +#define HVX_EXP_H + +#include +#include + +#include "hvx-base.h" +#include "hvx-floor.h" + +#define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!) +#define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!) +#define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!) +#define EXP_COEFF_2 (0x3D2AA9C1) // 0.416658 = 1/(4!) +#define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!) +#define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!) +#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805 +#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408 +#define EXP_ONE (0x3f800000) // 1.0 +#define EXP_RANGE_R (0x41a00000) // 20.0 +#define EXP_RANGE_L (0xc1a00000) // -20.0 + +static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) { + HVX_Vector z_qf32_v; + HVX_Vector x_v; + HVX_Vector x_qf32_v; + HVX_Vector y_v; + HVX_Vector k_v; + HVX_Vector f_v; + HVX_Vector epsilon_v; + HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E); + HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2); + HVX_Vector E_const; + HVX_Vector zero_v = Q6_V_vzero(); + + // exp(x) is approximated as follows: + // f = floor(x/ln(2)) = floor(x*log2(e)) + // epsilon = x - f*ln(2) + // exp(x) = exp(epsilon+f*ln(2)) + // = exp(epsilon)*exp(f*ln(2)) + // = exp(epsilon)*2^f + // + // Since epsilon is close to zero, it can be approximated with its Taylor series: + // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+... 
+ // Preserving the first eight elements, we get: + // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7 + // = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2 + + HVX_Vector temp_v = in_vec; + + // Clamp inputs to (-20.0, 20.0) + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R)); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec); + + in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v); + in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v); + + epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec); + epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v); + + // f_v is the floating point result and k_v is the integer result + f_v = hvx_vec_floor_f32(epsilon_v); + k_v = hvx_vec_truncate_f32(f_v); + + x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v); + + // x = x - f_v * logn2; + epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2); + x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v); + // normalize before every QFloat's vmpy + x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v); + + // z = x * x; + z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v); + z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v); + + x_v = Q6_Vsf_equals_Vqf32(x_qf32_v); + + // y = E4 + E5 * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_5); + y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v); + E_const = Q6_V_vsplat_R(EXP_COEFF_4); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E3 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_3); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E2 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_2); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E1 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_1); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E0 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_0); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = x + y * z; + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = y + 1.0; + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE)); + + // insert exponents + // y = ldexpf(y, k); + // y_v += k_v; // qf32 + // modify exponent + + y_v = Q6_Vsf_equals_Vqf32(y_v); + + // add k_v to the exponent of y_v + HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1); + + y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1); + y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent); + + // exponent cannot be negative; if overflow is detected, result is set to zero + HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent); + + y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN); + + y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v); + + return y_v; +} + +static inline HVX_Vector hvx_vec_exp_f32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) { + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); + + HVX_Vector out = hvx_vec_exp_f32(in_vec); + + return Q6_V_vmux_QVV(pred0, inf, out); +} + +static inline void hvx_exp_f32(const uint8_t * restrict src, 
uint8_t * restrict dst, const int num_elems, bool negate) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == hex_is_aligned((void *) src, VLEN)) || (0 == hex_is_aligned((void *) dst, VLEN))) { + unaligned_addr = 1; + } + // assert((0 == unaligned_addr) || (0 == num_elems_whole)); + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + } + + HVX_Vector vec_out = Q6_V_vzero(); + + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_f32(kInf); + + if (0 == unaligned_loop) { + HVX_Vector * p_vec_in1 = (HVX_Vector *) src; + HVX_Vector * p_vec_out = (HVX_Vector *) dst; + + #pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_f32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf); + } else { + *p_vec_out++ = hvx_vec_exp_f32_guard(*p_vec_in1++, max_exp, inf); + } + } + } else { + #pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_f32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf); + } else { + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(in, max_exp, inf); + } + } + } + + if (left_over > 0) { + const float * srcf = (float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_f32(in); + + vec_out = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf); + } else { + vec_out = hvx_vec_exp_f32_guard(in, max_exp, inf); + } + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); + } +} + +#endif /* HVX_EXP_H */ diff --git a/src/ggml-hexagon/htp/hvx-floor.h b/src/ggml-hexagon/htp/hvx-floor.h new file mode 100644 index 00000000..6a1bfde5 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-floor.h @@ -0,0 +1,100 @@ +#ifndef HVX_FLOOR_H +#define HVX_FLOOR_H + +#include +#include + +#include "hvx-base.h" + +#define IEEE_VSF_EXPLEN (8) +#define IEEE_VSF_EXPBIAS (127) +#define IEEE_VSF_EXPMASK (0xFF) +#define IEEE_VSF_MANTLEN (23) +#define IEEE_VSF_MANTMASK (0x7FFFFF) +#define IEEE_VSF_MIMPMASK (0x800000) + +static inline HVX_Vector hvx_vec_truncate_f32(HVX_Vector in_vec) { + HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); + HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); + HVX_Vector const_zero_v = Q6_V_vzero(); + + HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); + + HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; + expval_v &= IEEE_VSF_EXPMASK; + expval_v -= IEEE_VSF_EXPBIAS; + + // negative exp == fractional value + HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); + + HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift + + HVX_Vector mant_v = in_vec & mask_mant_v; // obtain mantissa + HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0 + + vout = Q6_Vw_vasr_VwVw(vout, rshift_v); // shift to obtain truncated integer + vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0 + + HVX_Vector neg_vout = -vout; + + vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // 
handle negatives + + return (vout); +} + +static inline HVX_Vector hvx_vec_floor_f32(HVX_Vector in_vec) { + HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); + HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); + HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN); + HVX_Vector const_zero_v = Q6_V_vzero(); + HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf + + HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); + + HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; + expval_v &= IEEE_VSF_EXPMASK; + expval_v -= IEEE_VSF_EXPBIAS; + + HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); + HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v); + HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v); + HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec); + + // if expval < 0 (q_negexp) // <0, floor is 0 + // if vin > 0 + // floor = 0 + // if vin < 0 + // floor = -1 + // if expval < mant_len (q_expltmn) // >0, but fraction may exist + // get sign (q_negative) + // mask >> expval // fraction bits to mask off + // vout = ~(mask) // apply mask to remove fraction + // if (qneg) // negative floor is one less (more, sign bit for neg) + // vout += ((impl_mask) >> expval) + // if (mask && vin) + // vout = vin + // else // already an integer + // ; // no change + + // compute floor + mask_mant_v >>= expval_v; + HVX_Vector neg_addin_v = mask_impl_v >> expval_v; + HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v); + HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec); + + HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set + HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v); + + HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v); // frac bits to clear + HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits + + vout = in_vec; + vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout); // expval0 -> 0 + vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1 + + return vout; +} + +#endif /* HVX_FLOOR_H */ diff --git a/src/ggml-hexagon/htp/hvx-inverse.c b/src/ggml-hexagon/htp/hvx-inverse.c deleted file mode 100644 index 4d70634f..00000000 --- a/src/ggml-hexagon/htp/hvx-inverse.c +++ /dev/null @@ -1,72 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "htp-ctx.h" -#include "htp-dma.h" -#include "htp-msg.h" -#include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" - -static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) { - HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - - HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); - const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); - - return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); -} - -void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly 
slower execution\n"); - unaligned_addr = 1; - } - // assert((0 == unaligned_addr) || (0 == num_elems_whole)); - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - static const uint32_t kNanInfMask = 0x7f800000; - const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask); - - if (0 == unaligned_loop) { - HVX_Vector * p_vec_in = (HVX_Vector *) src; - HVX_Vector * p_vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask); - } - } - - if (left_over > 0) { - const float * srcf = (float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); - } -} diff --git a/src/ggml-hexagon/htp/hvx-inverse.h b/src/ggml-hexagon/htp/hvx-inverse.h new file mode 100644 index 00000000..49f3efab --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-inverse.h @@ -0,0 +1,176 @@ +#ifndef HVX_INVERSE_H +#define HVX_INVERSE_H + +#include + +#include +#include +#include +#include +#include + +#include "hvx-base.h" + +// ==================================================== +// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5 +// Order:3; continuity: True; Ends forced: True +// Mode: unsigned; Result fractional bits: 14 +// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05 +// 32769 -32706 31252 -10589 +// 32590 -30635 22793 -4493 +// 32066 -27505 16481 -2348 +// 31205 -24054 11849 -1306 + +static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) { + // input is 0..0xffff representing 0.0 .. 1.0 + HVX_Vector p; + p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull); + p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); + p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); + p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); + return p; // signed result, 14 fractional bits +} + +// Find reciprocal of fp16. +// (1) first, convert to fp32, multiplying by 1.0; this is done to +// handle denormals. Ignoring sign and zero, result should be at +// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000) +// (exponent in range [103,143]) +// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly +// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32 +// (4) convert that to fp16 +// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace +// the result with the max value. +static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) { + HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF); + HVX_Vector avals = Q6_V_vand_VV(vals, em_mask); + HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals); + // is too small to 1/x ? 
for 'standard' fp16, this would be 0x101 + HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals); + + HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0 + HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32)); + HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32)); + + // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector + HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9)); + // likewise extract the upper 16 from each, containing the exponents in range 103..142 + HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0); + //Get exponent in IEEE 32-bit representation + exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7); + + // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane + // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0) + // Use poly to transform to 1/x, with 14 fractional bits + // + HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16); + + HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros + + // Get mantissa for 16-bit represenation + HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF)); + + //Compute Reciprocal Exponent + HVX_Vector exp_recip = + Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1))); + //Convert it for 16-bit representation + exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15)); + exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10); + + //Merge exponent and mantissa for reciprocal + HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip); + // map 'small' inputs to standard largest value 0x7bff + recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip); + // add sign back + recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000); + return recip; +} + +static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) { + HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3); + HVX_Vector two_sf = hvx_vec_splat_f32(2.0); + + // First approximation + HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf); + + HVX_Vector r_qf; + + // Refine + r_qf = Q6_Vqf32_vmpy_VsfVsf( + i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf))))); + r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( + r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); + r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( + r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); + + return Q6_Vsf_equals_Vqf32(r_qf); +} + +static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) { + HVX_Vector out = hvx_vec_inverse_f32(v_sf); + + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); + + return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); +} + +#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \ + \ + const uint32_t nvec = n / VLEN_FP32; \ + const uint32_t nloe = n % VLEN_FP32; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], 
nan_inf_mask); \ + } \ + if (nloe) { \ + HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask); \ + vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v); \ + } \ + } while(0) + +static inline void hvx_inverse_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_inverse_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_inverse_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_inverse_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_inverse_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_inverse_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_inverse_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_inverse_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_inverse_f32(uint8_t * restrict dst, uint8_t * restrict src, const int num_elems) { + if ((unsigned long) dst % 128 == 0) { + if ((unsigned long) src % 128 == 0) { + hvx_inverse_f32_aa(dst, src, num_elems); + } else { + hvx_inverse_f32_au(dst, src, num_elems); + } + } else { + if ((unsigned long) src % 128 == 0) { + hvx_inverse_f32_ua(dst, src, num_elems); + } else { + hvx_inverse_f32_uu(dst, src, num_elems); + } + } +} + +#endif // HVX_INVERSE_H diff --git a/src/ggml-hexagon/htp/hvx-reduce.h b/src/ggml-hexagon/htp/hvx-reduce.h new file mode 100644 index 00000000..8845fe73 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-reduce.h @@ -0,0 +1,225 @@ +#ifndef HVX_REDUCE_H +#define HVX_REDUCE_H + +#include +#include +#include +#include + +#include "hex-utils.h" +#include "hvx-base.h" +#include "hvx-types.h" + +static inline HVX_Vector hvx_vec_reduce_sum_n_i32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // int32 + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_reduce_sum_i32(HVX_Vector in) { + return hvx_vec_reduce_sum_n_i32(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_sum_n_qf32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right + sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) { + return hvx_vec_reduce_sum_n_qf32(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) { + return hvx_vec_reduce_sum_n_f32(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_max_f16(HVX_Vector in) { + unsigned 
total = 128; // total vec nbytes + unsigned width = 2; // fp16 nbytes + + HVX_Vector _max = in, _max_t; + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max2_f16(HVX_Vector in, HVX_Vector _max) { + unsigned total = 128; // total vec nbytes + unsigned width = 2; // fp32 nbytes + + HVX_Vector _max_t; + + _max = Q6_Vhf_vmax_VhfVhf(in, _max); + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max_f32(HVX_Vector in) { + unsigned total = 128; // total vec nbytes + unsigned width = 4; // fp32 nbytes + + HVX_Vector _max = in, _max_t; + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max2_f32(HVX_Vector in, HVX_Vector _max) { + unsigned total = 128; // total vec nbytes + unsigned width = 4; // fp32 nbytes + + HVX_Vector _max_t; + + _max = Q6_Vsf_vmax_VsfVsf(in, _max); + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +#define hvx_reduce_loop_body(src_type, init_vec, pad_vec, vec_op, reduce_op, scalar_reduce) \ + do { \ + src_type * restrict vsrc = (src_type *) src; \ + HVX_Vector acc = init_vec; \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = num_elems / epv; \ + const uint32_t nloe = num_elems % epv; \ + \ + uint32_t i = 0; \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + acc = vec_op(acc, vsrc[i]); \ + } \ + if (nloe) { \ + const float * srcf = (const float *) src + i * epv; \ + HVX_Vector in = *(HVX_UVector *) srcf; \ + HVX_Vector temp = Q6_V_valign_VVR(in, pad_vec, nloe * elem_size); \ + acc = vec_op(acc, temp); \ + } \ + HVX_Vector v = reduce_op(acc); \ + return scalar_reduce(v); \ + } while(0) + +#define HVX_REDUCE_MAX_OP(acc, val) Q6_Vsf_vmax_VsfVsf(acc, val) +#define HVX_REDUCE_SUM_OP(acc, val) Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(acc), val) +#define HVX_SUM_SQ_OP(acc, val) Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(val, val)) +#define HVX_REDUCE_MAX_SCALAR(v) hvx_vec_get_f32(v) +#define HVX_REDUCE_SUM_SCALAR(v) hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(v)) + +// Max variants + +static inline float hvx_reduce_max_f32_a(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = hvx_vec_splat_f32(((const float *) src)[0]); + assert((unsigned long) src % 128 == 0); + hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_REDUCE_MAX_OP, hvx_vec_reduce_max_f32, HVX_REDUCE_MAX_SCALAR); +} + +static inline float hvx_reduce_max_f32_u(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = hvx_vec_splat_f32(((const float *) src)[0]); + hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_REDUCE_MAX_OP, hvx_vec_reduce_max_f32, HVX_REDUCE_MAX_SCALAR); +} + +static inline float hvx_reduce_max_f32(const uint8_t * restrict src, const int num_elems) { + if (hex_is_aligned((void *) src, 128)) { + return hvx_reduce_max_f32_a(src, num_elems); + } else { + return hvx_reduce_max_f32_u(src, num_elems); + } +} + 
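// A minimal scalar model (illustration only; `example_rotate_reduce_max_f32` is a
// hypothetical name, not part of this header): the reducers above fold a vector with
// a rotate-and-combine loop that doubles the folded span every step, so a 32 x f32
// vector is reduced in log2(32) = 5 combines and every lane ends up holding the
// result. Q6_V_vror_VR plays the role of the modular rotation.
static inline float example_rotate_reduce_max_f32(const float in[32]) {
    float cur[32];
    float rot[32];
    for (int i = 0; i < 32; i++) {
        cur[i] = in[i];
    }
    for (int width = 1; width < 32; width <<= 1) {
        for (int i = 0; i < 32; i++) {
            rot[i] = cur[(i + width) % 32];              // rotate by `width` lanes
        }
        for (int i = 0; i < 32; i++) {
            cur[i] = cur[i] > rot[i] ? cur[i] : rot[i];  // elementwise max
        }
    }
    return cur[0];  // any lane works; hvx_vec_get_f32() similarly extracts a single lane
}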
+// Sum variants + +static inline float hvx_reduce_sum_f32_a(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + assert((unsigned long) src % 128 == 0); + hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_REDUCE_SUM_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_reduce_sum_f32_u(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_REDUCE_SUM_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_reduce_sum_f32(const uint8_t * restrict src, const int num_elems) { + if (hex_is_aligned((void *) src, 128)) { + return hvx_reduce_sum_f32_a(src, num_elems); + } else { + return hvx_reduce_sum_f32_u(src, num_elems); + } +} + +// Sum of squares variants + +static inline float hvx_sum_of_squares_f32_a(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + assert((uintptr_t) src % 128 == 0); + hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_SUM_SQ_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_sum_of_squares_f32_u(const uint8_t * restrict src, const int num_elems) { + HVX_Vector init_vec = Q6_V_vsplat_R(0); + hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_SUM_SQ_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR); +} + +static inline float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) { + if (hex_is_aligned((void *) src, 128)) { + return hvx_sum_of_squares_f32_a(src, num_elems); + } else { + return hvx_sum_of_squares_f32_u(src, num_elems); + } +} + +#undef hvx_reduce_loop_body +#undef HVX_REDUCE_MAX_OP +#undef HVX_REDUCE_SUM_OP +#undef HVX_REDUCE_MAX_SCALAR +#undef HVX_REDUCE_SUM_SCALAR +#undef HVX_SUM_SQ_OP + +#endif /* HVX_REDUCE_H */ diff --git a/src/ggml-hexagon/htp/hvx-scale.h b/src/ggml-hexagon/htp/hvx-scale.h new file mode 100644 index 00000000..c65c9863 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-scale.h @@ -0,0 +1,133 @@ +#ifndef HVX_SCALE_H +#define HVX_SCALE_H + +#include +#include +#include + +#include "hvx-base.h" + +#define hvx_scale_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + HVX_Vector vs = hvx_vec_splat_f32(scale); \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; ++i) { \ + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); \ + vdst[i] = Q6_Vsf_equals_Vqf32(v); \ + } \ + if (nloe) { \ + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); \ + vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v)); \ + } \ + } while(0) + +static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + assert((size_t) dst % 128 == 0); + assert((size_t) src % 128 == 0); + hvx_scale_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_scale_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + assert((size_t) dst % 128 == 0); + hvx_scale_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_scale_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + 
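// 'ua' variant: destination may be unaligned, source is expected to be 128-byte aligned (checked below).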
assert((size_t) src % 128 == 0); + hvx_scale_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + hvx_scale_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { + if (((size_t) dst & 127) == 0) { + if (((size_t) src & 127) == 0) { + hvx_scale_f32_aa(dst, src, n, scale); + } else { + hvx_scale_f32_au(dst, src, n, scale); + } + } else { + if (((size_t) src & 127) == 0) { + hvx_scale_f32_ua(dst, src, n, scale); + } else { + hvx_scale_f32_uu(dst, src, n, scale); + } + } +} + +#define hvx_scale_offset_f32_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + HVX_Vector vs = hvx_vec_splat_f32(scale); \ + HVX_Vector vo = hvx_vec_splat_f32(offset); \ + \ + const uint32_t elem_size = sizeof(float); \ + const uint32_t epv = 128 / elem_size; \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; ++i) { \ + HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \ + vdst[i] = Q6_Vsf_equals_Vqf32(v); \ + } \ + if (nloe) { \ + HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \ + vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v)); \ + } \ + } while(0) + +static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + assert((size_t) dst % 128 == 0); + assert((size_t) src % 128 == 0); + hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_scale_offset_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + assert((size_t) dst % 128 == 0); + hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_scale_offset_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + assert((size_t) src % 128 == 0); + hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { + if (((size_t) dst & 127) == 0) { + if (((size_t) src & 127) == 0) { + hvx_scale_offset_f32_aa(dst, src, n, scale, offset); + } else { + hvx_scale_offset_f32_au(dst, src, n, scale, offset); + } + } else { + if (((size_t) src & 127) == 0) { + hvx_scale_offset_f32_ua(dst, src, n, scale, offset); + } else { + hvx_scale_offset_f32_uu(dst, src, n, scale, offset); + } + } +} + +#endif // HVX_SCALE_H diff --git a/src/ggml-hexagon/htp/hvx-sigmoid.c b/src/ggml-hexagon/htp/hvx-sigmoid.c deleted file mode 100644 index 15ac6469..00000000 --- a/src/ggml-hexagon/htp/hvx-sigmoid.c +++ /dev/null @@ -1,49 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored 
"-Wunused-but-set-variable" - -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "htp-ctx.h" -#include "htp-dma.h" -#include "htp-msg.h" -#include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" - -#if 0 -// Reference algo used in hvx-utils -static void fast_sigmoid_f32(const float* restrict src, float* restrict dst, const int num_elems) -{ - const float c1 = 0.03138777; - const float c2 = 0.276281267; - const float c_log2f = 1.442695022; - - int32_t store_ints[32]; - float store_floats[3][32]; - - for (int i = 0; i < num_elems; i++) - { - float v = src0[i]; - - v *= c_log2f*0.5; - int intPart = (int)v; - float x = (v - intPart); - float xx = x * x; - float v1 = c_log2f + c2 * xx; - float v2 = x + xx * c1 * x; - float v3 = (v2 + v1); - *((int*)&v3) += intPart << 24; - float v4 = v2 - v1; - float v5 = v3 - v4; - float res = v3 / v5; - - dst[i] = res; - } -} -#endif diff --git a/src/ggml-hexagon/htp/hvx-sigmoid.h b/src/ggml-hexagon/htp/hvx-sigmoid.h new file mode 100644 index 00000000..1b4aaff0 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-sigmoid.h @@ -0,0 +1,114 @@ +#ifndef HVX_SIGMOID_H +#define HVX_SIGMOID_H + +#include "hvx-base.h" + +#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 +#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 +#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 +#define FAST_SIGMOID_C3 (0x3f000000) // 0.5 + +static inline HVX_Vector hvx_vec_fast_sigmoid_f32(HVX_Vector v) { + v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); + v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3)); + + HVX_Vector in_int = hvx_vec_truncate_f32(Q6_Vsf_equals_Vqf32(v)); + HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int)); + HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x); + + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2)); + v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); + + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1)); + v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx); + v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x); + + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1)); + HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1); + v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24); + v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent); + v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24); + + HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); + HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); + + HVX_Vector res = hvx_vec_inverse_f32(v5); + res = Q6_Vqf32_vmpy_VsfVsf(v3, res); + + return Q6_Vsf_equals_Vqf32(res); +} + +static inline HVX_Vector hvx_vec_fast_sigmoid_f32_guard(HVX_Vector v, + HVX_Vector one, + HVX_Vector max_exp, + HVX_Vector min_exp) { + const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); + const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); + + HVX_Vector out = hvx_vec_fast_sigmoid_f32(v); + out = Q6_V_vmux_QVV(pred_max, out, one); + return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero()); +} + +static inline HVX_Vector hvx_vec_tanh_f32(HVX_Vector x) { + // tanh(x) = 2 * sigmoid(2x) - 1 + HVX_Vector two = hvx_vec_splat_f32(2.0f); + HVX_Vector one = hvx_vec_splat_f32(1.0f); + HVX_Vector x2 = Q6_Vqf32_vmpy_VsfVsf(x, two); + + HVX_Vector max_exp = hvx_vec_splat_f32(87.f); + HVX_Vector min_exp = hvx_vec_splat_f32(-87.f); + + HVX_Vector sig2x = 
hvx_vec_fast_sigmoid_f32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp); + + HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two); + res = Q6_Vqf32_vsub_Vqf32Vsf(res, one); + return Q6_Vsf_equals_Vqf32(res); +} + +#define hvx_sigmoid_loop_body(dst_type, src_type, vec_store) \ + do { \ + dst_type * restrict vdst = (dst_type *) dst; \ + src_type * restrict vsrc = (src_type *) src; \ + \ + const HVX_Vector one = hvx_vec_splat_f32(1.f); \ + const HVX_Vector max_exp = hvx_vec_splat_f32(87.f); \ + const HVX_Vector min_exp = hvx_vec_splat_f32(-87.f); \ + \ + const uint32_t epv = 128 / sizeof(float); \ + const uint32_t nvec = n / epv; \ + const uint32_t nloe = n % epv; \ + \ + uint32_t i = 0; \ + \ + _Pragma("unroll(4)") \ + for (; i < nvec; i++) { \ + vdst[i] = hvx_vec_fast_sigmoid_f32_guard(vsrc[i], one, max_exp, min_exp); \ + } \ + if (nloe) { \ + HVX_Vector tmp = hvx_vec_fast_sigmoid_f32_guard(vsrc[i], one, max_exp, min_exp); \ + vec_store((void *) &vdst[i], nloe * sizeof(float), tmp); \ + } \ + } while(0) + +static inline void hvx_sigmoid_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + hvx_sigmoid_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a); +} + +static inline void hvx_sigmoid_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) dst % 128 == 0); + hvx_sigmoid_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a); +} + +static inline void hvx_sigmoid_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + assert((unsigned long) src % 128 == 0); + hvx_sigmoid_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u); +} + +static inline void hvx_sigmoid_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + hvx_sigmoid_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u); +} + +#endif /* HVX_SIGMOID_H */ diff --git a/src/ggml-hexagon/htp/hvx-sqrt.h b/src/ggml-hexagon/htp/hvx-sqrt.h new file mode 100644 index 00000000..28ee9f68 --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-sqrt.h @@ -0,0 +1,60 @@ +#ifndef HVX_SQRT_H +#define HVX_SQRT_H + +#include +#include + +#include "hex-utils.h" + +#include "hvx-base.h" + +#define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation +#define RSQRT_ONE_HALF 0x3f000000 // 0.5 +#define RSQRT_THREE_HALVES 0x3fc00000 // 1.5 + +static inline HVX_Vector hvx_vec_rsqrt_f32(HVX_Vector in_vec) { + //Algorithm : + // x2 = input*0.5 + // y = * (long *) &input + // y = 0x5f3759df - (y>>2) + // y = y*(threehalfs - x2*y*y) + + HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST); + HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF); + HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES); + + HVX_Vector x2, y, ypower2, temp; + + x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf); + x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero()); + + y = Q6_Vw_vasr_VwR(in_vec, 1); + y = Q6_Vw_vsub_VwVw(rsqrtconst, y); + + // 1st iteration + ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp)); + + // 2nd iteration + y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); + ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, 
Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); + + // 3rd iteration + y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); + ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); + + return Q6_Vsf_equals_Vqf32(temp); +} + +#endif /* HVX_SQRT_H */ diff --git a/src/ggml-hexagon/htp/hvx-types.h b/src/ggml-hexagon/htp/hvx-types.h new file mode 100644 index 00000000..d495a59f --- /dev/null +++ b/src/ggml-hexagon/htp/hvx-types.h @@ -0,0 +1,36 @@ +#ifndef HVX_TYPES_H +#define HVX_TYPES_H + +#include +#include + +#include + +#define SIZEOF_FP32 (4) +#define SIZEOF_FP16 (2) +#define VLEN (128) +#define VLEN_FP32 (VLEN / SIZEOF_FP32) +#define VLEN_FP16 (VLEN / SIZEOF_FP16) + +typedef union { + HVX_Vector v; + uint8_t b[VLEN]; + uint16_t h[VLEN_FP16]; + uint32_t w[VLEN_FP32]; + __fp16 fp16[VLEN_FP16]; + float fp32[VLEN_FP32]; +} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; + +typedef struct { + HVX_Vector v[2]; +} HVX_Vector_x2; + +typedef struct { + HVX_Vector v[4]; +} HVX_Vector_x4; + +typedef struct { + HVX_Vector v[8]; +} HVX_Vector_x8; + +#endif /* HVX_TYPES_H */ diff --git a/src/ggml-hexagon/htp/hvx-utils.c b/src/ggml-hexagon/htp/hvx-utils.c deleted file mode 100644 index 29d73b86..00000000 --- a/src/ggml-hexagon/htp/hvx-utils.c +++ /dev/null @@ -1,1020 +0,0 @@ -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" -#include "hvx-utils.h" - -#define htp_binary_ops_preamble \ - int step_of_4 = num_elems >> 7; \ - int step_of_2 = (num_elems - step_of_4 * VLEN_FP32 * 4) >> 6; \ - int step_of_1 = (num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2) >> 5; \ - int remaining = num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; \ - \ - const uint8_t * restrict src0_curr = src0; \ - const uint8_t * restrict src1_curr = src1; \ - uint8_t * restrict dst_curr = dst; - -void hvx_mul_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || - (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - - bool handled_leftover = false; - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - int step_of_1 = num_elems_whole >> 5; 
// divby 32, because 32 float = 128 bytes per HVX vector - int leftover_size = left_over * sizeof(float); - - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - - HVX_Vector slinep; - HVX_Vector slinec; - HVX_Vector sline; - HVX_Vector sline2p; - HVX_Vector sline2c; - HVX_Vector sline2; - - slinep = *vec_in1++; - sline2p = *vec_in2++; - #pragma unroll(4) - for (int i = step_of_1 - 1; i > 0; i--) { - slinec = *vec_in1++; - sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; - } - if (step_of_1 > 1) { - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; - sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; - } - if (left_over > 0) { - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); - handled_leftover = true; - } - } - - - if (left_over > 0 && !handled_leftover) { - const float * src0f = (const float *) src0 + num_elems_whole; - const float * src1f = (const float *) src1 + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in1 = *(HVX_UVector *) src0f; - HVX_Vector in2 = *(HVX_UVector *) src1f; - - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - htp_binary_ops_preamble; - - for (int i = 0; i < step_of_4; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); - - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); - - HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); - - src0_curr += 4 * VLEN; - - HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(v3a, v3b); - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); - - *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); - - HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v4a, v4b); - - src1_curr += 4 * VLEN; - - *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); - - dst_curr += 4 * VLEN; - } - - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - 
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - src0_curr += 2 * VLEN; - - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); - - src1_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - - src1_curr += VLEN; - - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(va, vb); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); - - dst_curr += VLEN; - } - - if (remaining > 0) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); - } -} - -void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - const uint8_t * restrict src2, - uint8_t * restrict dst, - const int num_elems) { - const uint8_t * restrict src0_curr = src0; - const uint8_t * restrict src1_curr = src1; - const uint8_t * restrict src2_curr = src2; - uint8_t * restrict dst_curr = dst; - - int step_of_2 = num_elems >> 6; - int step_of_1 = (num_elems - step_of_2 * VLEN_FP32 * 2) >> 5; - int remaining = num_elems - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; - - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - HVX_Vector v1c = *(HVX_Vector *) src2_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1_ = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1_), v1c); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v2c = *(HVX_Vector *) (src2_curr + VLEN); - - src0_curr += 2 * VLEN; - - HVX_Vector v2_ = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2_), v2c); - - src1_curr += 2 * VLEN; - src2_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - src1_curr += VLEN; - - HVX_Vector vc = *(HVX_Vector *) src2_curr; - src2_curr += VLEN; - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(va, vb); - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), vc); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v2); - dst_curr += VLEN; - } - if (remaining > 0) { - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), *(HVX_Vector *) src2_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v2)); - } -} - -void hvx_add_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || - (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower 
execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); - - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - } - } - - if (left_over > 0) { - const float * src0f = (const float *) src0 + num_elems_whole; - const float * src1f = (const float *) src1 + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in1 = *(HVX_UVector *) src0f; - HVX_Vector in2 = *(HVX_UVector *) src1f; - - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_add_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - htp_binary_ops_preamble; - - for (int i = 0; i < step_of_4; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); - - HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); - - HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); - - src0_curr += 4 * VLEN; - - HVX_Vector v3 = Q6_Vqf32_vadd_VsfVsf(v3a, v3b); - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); - - *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); - - HVX_Vector v4 = Q6_Vqf32_vadd_VsfVsf(v4a, v4b); - - src1_curr += 4 * VLEN; - - *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); - - dst_curr += 4 * VLEN; - } - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - src0_curr += 2 * VLEN; - - HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b); - - src1_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - - src1_curr += VLEN; - - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(va, vb); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); - - dst_curr += VLEN; - } - if (remaining > 0) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) 
src1_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); - } -} - -void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - static const float kInf = INFINITY; - const HVX_Vector inf = hvx_vec_splat_fp32(kInf); - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *vec_in1++; - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - v = Q6_Vsf_equals_Vqf32(v); - v = Q6_V_vmux_QVV(pred_inf, inf, v); - *vec_out++ = v; - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - out = Q6_Vsf_equals_Vqf32(out); - out = Q6_V_vmux_QVV(pred_inf, inf, out); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - out = Q6_Vsf_equals_Vqf32(out); - out = Q6_V_vmux_QVV(pred_inf, inf, out); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); - } -} - -void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool handled_leftover = false; - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover_size = left_over * sizeof(float); - - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; - - HVX_Vector 
slinep; - HVX_Vector slinec; - HVX_Vector sline; - - slinep = *input_v_ptr++; - - #pragma unroll(4) - for (int i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - /* Prepare slinep for next iteration */ - slinep = slinec; - } - - if (step_of_1 > 0) { - slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - - slinep = slinec; - } - - if (leftover_size > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - - HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); - handled_leftover = true; - } - } - - if (left_over > 0 && !handled_leftover) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_sub_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || - (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - } - } - - if (left_over > 0) { - const float * src0f = (const float *) src0 + num_elems_whole; - const float * src1f = (const float *) src1 + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in1 = *(HVX_UVector *) src0f; - HVX_Vector in2 = *(HVX_UVector *) src1f; - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -void hvx_sub_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems) { - htp_binary_ops_preamble; - - for (int i = 0; i < step_of_4; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) 
src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); - - HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); - - HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); - - src0_curr += 4 * VLEN; - - HVX_Vector v3 = Q6_Vqf32_vsub_VsfVsf(v3a, v3b); - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); - - *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); - - HVX_Vector v4 = Q6_Vqf32_vsub_VsfVsf(v4a, v4b); - - src1_curr += 4 * VLEN; - - *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); - - dst_curr += 4 * VLEN; - } - for (int i = 0; i < step_of_2; i++) { - HVX_Vector v1a = *(HVX_Vector *) src0_curr; - - HVX_Vector v1b = *(HVX_Vector *) src1_curr; - - HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b); - - HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); - - src0_curr += 2 * VLEN; - - HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b); - - src1_curr += 2 * VLEN; - - *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); - - dst_curr += 2 * VLEN; - } - for (int i = 0; i < step_of_1; i++) { - HVX_Vector va = *(HVX_Vector *) src0_curr; - - src0_curr += VLEN; - - HVX_Vector vb = *(HVX_Vector *) src1_curr; - - src1_curr += VLEN; - - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(va, vb); - - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); - - dst_curr += VLEN; - } - if (remaining > 0) { - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); - hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); - } -} - -void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec); - - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec); 
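// The tail is still computed on a full 128-byte vector; hvx_vec_store_u() then writes back
// only left_over * SIZEOF_FP32 bytes, so lanes past the row end are computed but never stored.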
- hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - } -} - -float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); - } - - assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; - - HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); - HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); - sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); - vec_in1++; - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - - HVX_Vector vec_left = *(HVX_UVector *) srcf; - - HVX_Vector vec_left_sq = Q6_Vqf32_vmpy_VsfVsf(vec_left, vec_left); - HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left_sq, zero_vec, left_over * SIZEOF_FP32); - - sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, vec_tmp); - } - - HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec_acc); - return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v)); -} - -float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); - } - - HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); - HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - - if (0 == unaligned_loop) { - HVX_Vector * vec_in = (HVX_Vector *) src; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); - sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), in); - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - - HVX_Vector vec_left = *(HVX_UVector *) srcf; - HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left, zero_vec, left_over * SIZEOF_FP32); - // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, vec_tmp); - sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), vec_tmp); - } - - HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec); - return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v)); -} - -float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { - int left_over = num_elems & (VLEN_FP32 - 1); - int num_elems_whole = num_elems - left_over; - - int unaligned_addr = 0; - int unaligned_loop = 0; - if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); - unaligned_addr = 1; - } - - if ((1 == unaligned_addr) && (num_elems_whole != 0)) { - unaligned_loop = 1; - FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly 
slower execution\n"); - } - - HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); - HVX_Vector vec_first = hvx_vec_splat_fp32(((const float *) src)[0]); - - if (0 == unaligned_loop) { - HVX_Vector * restrict vec_in = (HVX_Vector *) src; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); - } - } else { - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, in); - } - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - - HVX_Vector in = *(HVX_UVector *) srcf; - - HVX_Vector temp = Q6_V_valign_VVR(in, vec_first, left_over * SIZEOF_FP32); - vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, temp); - } - - HVX_Vector v = hvx_vec_reduce_max_fp32(vec_max); - return hvx_vec_get_fp32(v); -} - -void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - int unalign_address = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unalign_address = 1; - } - - const float * src_f = (const float *) src; - - HVX_Vector vec_min = hvx_vec_splat_fp32(val); - - if(unalign_address == 0){ - HVX_Vector * restrict vec_in = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); - *vec_out++ = (min_clamp); - } - }else{ - HVX_UVector * restrict vec_in = (HVX_Vector *) src; - HVX_UVector * restrict vec_out = (HVX_Vector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); - *vec_out++ = (min_clamp); - } - } - - if (left_over > 0 ) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_UVector in = *(HVX_UVector *) srcf; - - HVX_UVector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, (min_clamp)); - } -} - -void hvx_clamp_scalar_f32(const uint8_t * restrict src, - const float limit_left, - const float limit_right, - uint8_t * restrict dst, - const int num_elems) { - size_t left_over = num_elems & (VLEN_FP32 - 1); - size_t num_elems_whole = num_elems - left_over; - - int unalign_address = 0; - if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); - unalign_address = 1; - } - - HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); - HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); - - if(unalign_address == 0){ - HVX_Vector * restrict vec_in = (HVX_Vector *) src; - HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - - - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in_vec = *vec_in++; - HVX_Vector temp_v = in_vec; - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, 
temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec); - - *vec_out++ = in_vec; - } - - }else{ - - HVX_UVector * restrict vec_in = (HVX_UVector *) src; - HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - - #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in_vec = *vec_in++; - HVX_Vector temp_v = in_vec; - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec); - - *vec_out++ = in_vec; - } - - } - - if (left_over > 0) { - const float * srcf = (const float *) src + num_elems_whole; - float * dstf = (float *) dst + num_elems_whole; - - HVX_Vector in_vec = *(HVX_UVector *) srcf; - - HVX_Vector temp_v = in_vec; - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec); - - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec); - } -} - - diff --git a/src/ggml-hexagon/htp/hvx-utils.h b/src/ggml-hexagon/htp/hvx-utils.h index 22876e6d..7b79a5ea 100644 --- a/src/ggml-hexagon/htp/hvx-utils.h +++ b/src/ggml-hexagon/htp/hvx-utils.h @@ -1,1353 +1,17 @@ #ifndef HVX_UTILS_H #define HVX_UTILS_H -#include "ops-utils.h" - -#include -#include - -#define SIZEOF_FP32 (4) -#define SIZEOF_FP16 (2) -#define VLEN (128) -#define VLEN_FP32 (VLEN / SIZEOF_FP32) -#define VLEN_FP16 (VLEN / SIZEOF_FP16) - -typedef union { - HVX_Vector v; - uint8_t b[VLEN]; - uint16_t h[VLEN_FP16]; - uint32_t w[VLEN_FP32]; - __fp16 fp16[VLEN_FP16]; - float fp32[VLEN_FP32]; -} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; - -/* Q6_Vsf_equals_Vw is only available on v73+.*/ -#if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) -{ - HVX_Vector const vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); - return ret; -} - -static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) -{ - return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); -} -#endif - -static inline HVX_Vector hvx_vec_splat_fp32(float v) { - union { - float f; - uint32_t i; - } fp32 = { .f = v }; - - return Q6_V_vsplat_R(fp32.i); -} - -static inline HVX_Vector hvx_vec_splat_fp16(float v) { - union { - __fp16 f; - uint16_t i; - } fp16 = { .f = v }; - - return Q6_Vh_vsplat_R(fp16.i); -} - -static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) { - // Rotate as needed. 
- v = Q6_V_vlalign_VVR(v, v, (size_t) addr); - - uint32_t left_off = (size_t) addr & 127; - uint32_t right_off = left_off + n; - - HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); - HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); - - if (right_off > 128) { - Q6_vmem_QRIV(qr, (HVX_Vector *) addr + 1, v); - // all 1's - qr = Q6_Q_vcmp_eq_VbVb(v, v); - } - - ql_not = Q6_Q_or_QQn(ql_not, qr); - Q6_vmem_QnRIV(ql_not, (HVX_Vector *) addr, v); -} - -static inline void hvx_vec_store_a(void * ptr, size_t n, HVX_Vector v) { - assert((unsigned long) ptr % 128 == 0); - - HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) ptr); - HVX_VectorPred qr = Q6_Q_vsetq2_R(n); - ql_not = Q6_Q_or_QQn(ql_not, qr); - Q6_vmem_QnRIV(ql_not, (HVX_Vector *) ptr, v); -} - -static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) { - // vdelta control to replicate first 4 bytes across all elements - static const uint8_t __attribute__((aligned(128))) repl[128] = { - 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, - }; - - HVX_Vector ctrl = *(HVX_Vector *) repl; - return Q6_V_vdelta_VV(v, ctrl); -} - -// copy n fp16 elements : source and destination are aligned to HVX Vector (128) -static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) dst % 128 == 0); - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); - } -} - -// copy n fp16 elements : source is aligned, destination is potentially unaligned -static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); - } -} - -// copy n fp16 elements : source is aligned, destination is potentially unaligned -static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_UVector * restrict vsrc = (HVX_UVector *) src; - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - 
HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); - } -} - -// copy n fp32 elements : source and destination are aligned to HVX Vector (128) -static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) dst % 128 == 0); - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy n fp32 elements : source is aligned, destination is unaligned -static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; - HVX_Vector * restrict vsrc = (HVX_Vector *) src; - - assert((unsigned long) src % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy n fp32 elements : source is unaligned, destination is aligned -static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - HVX_UVector * restrict vsrc = (HVX_UVector *) src; - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy n fp32 elements : source is unaligned, destination unaligned -static inline void hvx_copy_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; - HVX_UVector * restrict vsrc = (HVX_UVector *) src; - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - HVX_Vector v = vsrc[i]; - vdst[i] = v; - } - - if (nloe) { - HVX_Vector v = vsrc[i]; - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); - } -} - -// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned -static inline void hvx_copy_fp16_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16 - HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32 - - const HVX_Vector zero = Q6_V_vsplat_R(0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - vdst[i] = Q6_Vh_vdeal_Vh(s_hf); - } - - if (nloe) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], 
zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf)); - } -} - -// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned -static inline void hvx_copy_fp16_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16 - HVX_Vector * restrict vsrc = (HVX_Vector *) src; // fp32 - - const HVX_Vector zero = Q6_V_vsplat_R(0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - vdst[i] = Q6_Vh_vdeal_Vh(s_hf); - } - - if (nloe) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf)); - } -} - -// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned -static inline void hvx_copy_fp16_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; // fp16 - HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32 - - const HVX_Vector zero = Q6_V_vsplat_R(0); - - uint32_t nvec = n / 64; - uint32_t nloe = n % 64; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - vdst[i] = Q6_Vh_vdeal_Vh(s_hf); - } - - if (nloe) { - // Load y (fp32) and convert into fp16 - HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements - HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements - HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf)); - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf)); - } -} - -// bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned -static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) { - HVX_Vector * restrict vdst = (HVX_Vector *) dst; - - HVX_Vector velem = hvx_vec_splat_fp32(elem); - - assert((unsigned long) dst % 128 == 0); - - uint32_t nvec = n / 32; - uint32_t nloe = n % 32; - - uint32_t i = 0; - - #pragma unroll(4) - for (; i < nvec; i++) { - vdst[i] = velem; - } - - if (nloe) { - hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), velem); - } -} - - -/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. 
*/ -static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { - HVX_VectorAlias u = { .v = v }; - - const uint32_t n0 = n / 16; - const uint32_t n1 = n % 16; - int i = 0; - for (; i < n0; i++) { - htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); - } - if (n1) { - htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); - } -} - -static void hvx_vec_dump_fp16(char * pref, HVX_Vector v) { - hvx_vec_dump_fp16_n(pref, v, 64); -} - -static void hvx_vec_dump_fp32_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - float d[32]; - } u = { .v = v }; - - const uint32_t n0 = n / 16; - const uint32_t n1 = n % 16; - int i = 0; - for (; i < n0; i++) { - htp_dump_fp32_line(pref, u.d + (16 * i), 16); - } - if (n1) { - htp_dump_fp32_line(pref, u.d + (16 * i), n1); - } -} - -static void hvx_vec_dump_fp32_hmt(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - float d[32]; - } u = { .v = v }; - - FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1], - u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); -} - -static void hvx_vec_dump_fp32(char * pref, HVX_Vector v) { - hvx_vec_dump_fp32_n(pref, v, 32); -} - -static void hvx_vec_dump_int32(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int32_t d[32]; - } u = { .v = v }; - - for (int i = 0; i < 32 / 16; i++) { - htp_dump_int32_line(pref, u.d + (16 * i), 16); - } -} - -static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int32_t d[32]; - } u = { .v = v }; - - FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12], - u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); -} - -static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int8_t d[128]; - } u = { .v = v }; - - FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60], - u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]); -} - -static void hvx_vec_dump_int8(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - int8_t d[128]; - } u = { .v = v }; - - for (int i = 0; i < 128 / 16; i++) { - htp_dump_int8_line(pref, u.d + (16 * i), 16); - } -} - -static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) { - union { - HVX_Vector v; - uint8_t d[128]; - } u = { .v = v }; - - for (int i = 0; i < 128 / 16; i++) { - htp_dump_uint8_line(pref, u.d + (16 * i), 16); - } -} - -static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) { - typedef union { - HVX_Vector v; - int8_t d[128]; - } U; - - U u0 = { .v = v0 }; - U u1 = { .v = v1 }; - - for (int i = 0; i < n; i++) { - if (u0.d[i] != u1.d[i]) { - return false; - } - } - - return true; -} - -static inline float hvx_vec_get_fp32(HVX_Vector v) { - float __attribute__((aligned(128))) x; - hvx_vec_store_a(&x, 4, v); - return x; -} - -static inline HVX_Vector hvx_vec_int32_reduce_sum_n(HVX_Vector in, unsigned int n) { - unsigned int total = n * 4; // total vec nbytes - unsigned int width = 4; // int32 - - HVX_Vector sum = in, sum_t; - while (width < total) { - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum - width = width << 1; - } - return sum; -} - -static inline HVX_Vector hvx_vec_int32_reduce_sum(HVX_Vector in) { - return hvx_vec_int32_reduce_sum_n(in, 32); -} - -static inline HVX_Vector hvx_vec_qf32_reduce_sum_n(HVX_Vector in, unsigned int n) { - unsigned int total = n * 4; // total vec nbytes - unsigned int width = 4; // fp32 nbytes - - HVX_Vector sum = in, sum_t; - while (width < total) { - sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right - sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum - width = width << 1; - } - return sum; -} - -static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) { - return hvx_vec_qf32_reduce_sum_n(in, 32); -} - -static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n) { - unsigned int total = n * 4; // total vec nbytes - unsigned int width = 4; // fp32 nbytes - - HVX_Vector sum = in, sum_t; - while (width < total) { - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum - width = width << 1; - } - return sum; -} - -static inline HVX_Vector hvx_vec_fp32_reduce_sum(HVX_Vector in) { - return hvx_vec_fp32_reduce_sum_n(in, 32); -} - -static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) { - unsigned total = 128; // total vec nbytes - unsigned width = 2; // fp16 nbytes - - HVX_Vector _max = in, _max_t; - while (width < total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_reduce_max2_fp16(HVX_Vector in, HVX_Vector _max) { - unsigned total = 128; // total vec nbytes - unsigned width = 2; // fp32 nbytes - - HVX_Vector _max_t; - - _max = Q6_Vhf_vmax_VhfVhf(in, _max); - while (width < total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_reduce_max_fp32(HVX_Vector in) { - unsigned total = 128; // total vec nbytes - unsigned width = 4; // fp32 nbytes - - HVX_Vector _max = in, _max_t; - while (width < 
total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_reduce_max2_fp32(HVX_Vector in, HVX_Vector _max) { - unsigned total = 128; // total vec nbytes - unsigned width = 4; // fp32 nbytes - - HVX_Vector _max_t; - - _max = Q6_Vsf_vmax_VsfVsf(in, _max); - while (width < total) { - _max_t = Q6_V_vror_VR(_max, width); // rotate right - _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max - width = width << 1; - } - - return _max; -} - -static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { - // abs by clearing the fp16 sign bit - HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff); - return Q6_V_vand_VV(v, mask); -} - -static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { - // neg by setting the fp16 sign bit - HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); - return Q6_V_vxor_VV(v, mask); -} - -static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { - // abs by clearing the fp32 sign bit - HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff); - return Q6_V_vand_VV(v, mask); -} - -static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { -#if __HVX_ARCH__ > 75 - return Q6_Vsf_vfneg_Vsf(v); -#else - // neg by setting the fp32 sign bit - HVX_Vector mask = Q6_V_vsplat_R(0x80000000); - return Q6_V_vxor_VV(v, mask); -#endif // __HVX_ARCH__ > 75 -} - -// ==================================================== -// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5 -// Order:3; continuity: True; Ends forced: True -// Mode: unsigned; Result fractional bits: 14 -// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05 -// 32769 -32706 31252 -10589 -// 32590 -30635 22793 -4493 -// 32066 -27505 16481 -2348 -// 31205 -24054 11849 -1306 - -static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) { - // input is 0..0xffff representing 0.0 .. 1.0 - HVX_Vector p; - p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull); - p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); - p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); - p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); - return p; // signed result, 14 fractional bits -} - -// Find reciprocal of fp16. -// (1) first, convert to fp32, multiplying by 1.0; this is done to -// handle denormals. Ignoring sign and zero, result should be at -// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000) -// (exponent in range [103,143]) -// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly -// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32 -// (4) convert that to fp16 -// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace -// the result with the max value. -static inline HVX_Vector hvx_vec_inverse_fp16(HVX_Vector vals) { - HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF); - HVX_Vector avals = Q6_V_vand_VV(vals, em_mask); - HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals); - // is too small to 1/x ? 
for 'standard' fp16, this would be 0x101 - HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals); - - HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0 - HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32)); - HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32)); - - // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector - HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9)); - // likewise extract the upper 16 from each, containing the exponents in range 103..142 - HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0); - //Get exponent in IEEE 32-bit representation - exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7); - - // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane - // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0) - // Use poly to transform to 1/x, with 14 fractional bits - // - HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16); - - HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros - - // Get mantissa for 16-bit represenation - HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF)); - - //Compute Reciprocal Exponent - HVX_Vector exp_recip = - Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1))); - //Convert it for 16-bit representation - exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15)); - exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10); - - //Merge exponent and mantissa for reciprocal - HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip); - // map 'small' inputs to standard largest value 0x7bff - recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip); - // add sign back - recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000); - return recip; -} - -#define IEEE_VSF_EXPLEN (8) -#define IEEE_VSF_EXPBIAS (127) -#define IEEE_VSF_EXPMASK (0xFF) -#define IEEE_VSF_MANTLEN (23) -#define IEEE_VSF_MANTMASK (0x7FFFFF) -#define IEEE_VSF_MIMPMASK (0x800000) - -static inline HVX_Vector hvx_vec_truncate_fp32(HVX_Vector in_vec) { - HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); - HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); - HVX_Vector const_zero_v = Q6_V_vzero(); - - HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); - - HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; - expval_v &= IEEE_VSF_EXPMASK; - expval_v -= IEEE_VSF_EXPBIAS; - - // negative exp == fractional value - HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); - - HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift - - HVX_Vector mant_v = in_vec & mask_mant_v; // obtain mantissa - HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0 - - vout = Q6_Vw_vasr_VwVw(vout, rshift_v); // shift to obtain truncated integer - vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0 - - HVX_Vector neg_vout = -vout; - - vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // handle negatives - - return (vout); -} - -static inline HVX_Vector hvx_vec_floor_fp32(HVX_Vector in_vec) { - HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); - HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); - HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN); - HVX_Vector const_zero_v = Q6_V_vzero(); - HVX_Vector const_negone_v = 
Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf - - HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); - - HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; - expval_v &= IEEE_VSF_EXPMASK; - expval_v -= IEEE_VSF_EXPBIAS; - - HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); - HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v); - HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v); - HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec); - - // if expval < 0 (q_negexp) // <0, floor is 0 - // if vin > 0 - // floor = 0 - // if vin < 0 - // floor = -1 - // if expval < mant_len (q_expltmn) // >0, but fraction may exist - // get sign (q_negative) - // mask >> expval // fraction bits to mask off - // vout = ~(mask) // apply mask to remove fraction - // if (qneg) // negative floor is one less (more, sign bit for neg) - // vout += ((impl_mask) >> expval) - // if (mask && vin) - // vout = vin - // else // already an integer - // ; // no change - - // compute floor - mask_mant_v >>= expval_v; - HVX_Vector neg_addin_v = mask_impl_v >> expval_v; - HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v); - HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec); - - HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set - HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v); - - HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v); // frac bits to clear - HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits - - vout = in_vec; - vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout); // expval0 -> 0 - vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1 - - return vout; -} - -static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { - // This looks complicated. - // Ideally should just be Q6_Vh_equals_Vhf(vin) - // but that instruction does not do proper rounding. - - // convert to qf32, multiplying by 1.0 in the process. - HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00)); - - // 'in-range' values are +/32752. - // add 192K to it, convert to sf - HVX_Vector v192K = Q6_V_vsplat_R(0x48400000); - HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K)); - HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K)); - - // for in-range cases, result is {163858... 229360} so the exponent is always 144. - // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer. - // Start by <<10 to get the final 'sign' bit in bit 15... 
- vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10); - vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10); - - // now round down to 16 - return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0); -} - -static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { - HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3); - HVX_Vector two_sf = hvx_vec_splat_fp32(2.0); - - // First approximation - HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf); - - HVX_Vector r_qf; - - // Refine - r_qf = Q6_Vqf32_vmpy_VsfVsf( - i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf))))); - r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( - r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); - r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( - r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); - - return Q6_Vsf_equals_Vqf32(r_qf); -} - -#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 -#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 -#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 -#define FAST_SIGMOID_C3 (0x3f000000) // 0.5 - -static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) { - v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); - v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3)); - - HVX_Vector in_int = hvx_vec_truncate_fp32(Q6_Vsf_equals_Vqf32(v)); - HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int)); - HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x); - - HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2)); - v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); - - HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1)); - v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx); - v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x); - - HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1)); - HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1); - v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24); - v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent); - v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24); - - HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); - HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); - - HVX_Vector res = hvx_vec_inverse_fp32(v5); - res = Q6_Vqf32_vmpy_VsfVsf(v3, res); - - return Q6_Vsf_equals_Vqf32(res); -} - -#define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!) -#define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!) -#define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!) -#define EXP_COEFF_2 (0x3D2AA9C1) // 0.416658 = 1/(4!) -#define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!) -#define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!) 
-#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805 -#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408 -#define EXP_ONE (0x3f800000) // 1.0 -#define EXP_RANGE_R (0x41a00000) // 20.0 -#define EXP_RANGE_L (0xc1a00000) // -20.0 - -static inline HVX_Vector hvx_vec_exp_fp32(HVX_Vector in_vec) { - HVX_Vector z_qf32_v; - HVX_Vector x_v; - HVX_Vector x_qf32_v; - HVX_Vector y_v; - HVX_Vector k_v; - HVX_Vector f_v; - HVX_Vector epsilon_v; - HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E); - HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2); - HVX_Vector E_const; - HVX_Vector zero_v = Q6_V_vzero(); - - // exp(x) is approximated as follows: - // f = floor(x/ln(2)) = floor(x*log2(e)) - // epsilon = x - f*ln(2) - // exp(x) = exp(epsilon+f*ln(2)) - // = exp(epsilon)*exp(f*ln(2)) - // = exp(epsilon)*2^f - // - // Since epsilon is close to zero, it can be approximated with its Taylor series: - // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+... - // Preserving the first eight elements, we get: - // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7 - // = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2 - - HVX_Vector temp_v = in_vec; - - // Clamp inputs to (-20.0, 20.0) - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R)); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec); - - in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v); - in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v); - - epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec); - epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v); - - // f_v is the floating point result and k_v is the integer result - f_v = hvx_vec_floor_fp32(epsilon_v); - k_v = hvx_vec_truncate_fp32(f_v); - - x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v); - - // x = x - f_v * logn2; - epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2); - x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v); - // normalize before every QFloat's vmpy - x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v); - - // z = x * x; - z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v); - z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v); - - x_v = Q6_Vsf_equals_Vqf32(x_qf32_v); - - // y = E4 + E5 * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_5); - y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v); - E_const = Q6_V_vsplat_R(EXP_COEFF_4); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E3 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_3); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E2 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_2); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E1 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_1); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = E0 + y * x; - E_const = Q6_V_vsplat_R(EXP_COEFF_0); - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = x + y * z; - y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v); - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); - - // y = y + 1.0; - y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE)); - - // insert exponents - // y = 
ldexpf(y, k); - // y_v += k_v; // qf32 - // modify exponent - - y_v = Q6_Vsf_equals_Vqf32(y_v); - - // add k_v to the exponent of y_v - HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1); - - y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1); - y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent); - - // exponent cannot be negative; if overflow is detected, result is set to zero - HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent); - - y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN); - - y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v); - - return y_v; -} - -#define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation -#define RSQRT_ONE_HALF 0x3f000000 // 0.5 -#define RSQRT_THREE_HALVES 0x3fc00000 // 1.5 - -static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { - //Algorithm : - // x2 = input*0.5 - // y = * (long *) &input - // y = 0x5f3759df - (y>>2) - // y = y*(threehalfs - x2*y*y) - - HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST); - HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF); - HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES); - - HVX_Vector x2, y, ypower2, temp; - - x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf); - x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero()); - - y = Q6_Vw_vasr_VwR(in_vec, 1); - y = Q6_Vw_vsub_VwVw(rsqrtconst, y); - - // 1st iteration - ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y); - ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); - temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); - temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp)); - - // 2nd iteration - y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); - ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); - ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); - temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); - - // 3rd iteration - y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); - ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); - ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); - temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); - temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); - - return Q6_Vsf_equals_Vqf32(temp); -} - -static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v, - HVX_Vector one, - HVX_Vector max_exp, - HVX_Vector min_exp) { - const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); - const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); - - HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); - out = Q6_V_vmux_QVV(pred_max, out, one); - return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero()); -} - -static inline HVX_Vector hvx_vec_tanh_fp32(HVX_Vector x) { - // tanh(x) = 2 * sigmoid(2x) - 1 - HVX_Vector two = hvx_vec_splat_fp32(2.0f); - HVX_Vector one = hvx_vec_splat_fp32(1.0f); - HVX_Vector x2 = Q6_Vqf32_vmpy_VsfVsf(x, two); - - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - - HVX_Vector sig2x = hvx_vec_fast_sigmoid_fp32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp); - - HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two); - res = Q6_Vqf32_vsub_Vqf32Vsf(res, one); - return Q6_Vsf_equals_Vqf32(res); -} - -static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict 
dst, const int num_elems) { - int step_of_1 = num_elems >> 5; - int remaining = num_elems - step_of_1 * VLEN_FP32; - - const HVX_Vector * restrict v_src = (HVX_Vector *) src; - HVX_Vector * restrict v_dst = (HVX_Vector *) dst; - - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - - const HVX_Vector one = hvx_vec_splat_fp32(1.f); - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - - #pragma unroll(4) - for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); - } - - if (remaining > 0) { - const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32; - float * dstf = (float *) dst + step_of_1*VLEN_FP32; - - HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp); - hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out); - } -} - -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); - - int32_t leftover_size = leftover * sizeof(float); - - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - - const HVX_Vector one = hvx_vec_splat_fp32(1.f); - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - - const float *input = (float *)src; - float *output = (float *)dst; - - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; - - HVX_Vector slinep; - HVX_Vector slinec; - HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for (int i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - /* Prepare slinep for next iteration */ - slinep = slinec; - } - - if (step_of_1 > 0) { - slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - ; - - slinep = slinec; - } - if (leftover > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? 
slinep : *input_v_ptr++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - - HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); - } -} - -static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - - HVX_Vector * vsrc = (HVX_Vector *) src; - HVX_Vector * vdst = (HVX_Vector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - - HVX_UVector * vsrc = (HVX_UVector *) src; - HVX_UVector * vdst = (HVX_UVector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) { - if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) { - hvx_scale_f32_aa(dst, src, n, scale); - } else { - hvx_scale_f32_uu(dst, src, n, scale); - } -} - -static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - HVX_Vector vo = hvx_vec_splat_fp32(offset); - - HVX_Vector * vsrc = (HVX_Vector *) src; - HVX_Vector * vdst = (HVX_Vector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { - int nvec = n / VLEN_FP32; - int nloe = n % VLEN_FP32; - - HVX_Vector vs = hvx_vec_splat_fp32(scale); - HVX_Vector vo = hvx_vec_splat_fp32(offset); - - HVX_UVector * vsrc = (HVX_UVector *) src; - HVX_UVector * vdst = (HVX_UVector *) dst; - - uint32_t i = 0; - - #pragma unroll(4) - for (i = 0; i < nvec; ++i) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - vdst[i] = Q6_Vsf_equals_Vqf32(v); - } - - if (nloe) { - HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); - hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v)); - } -} - -static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) { - if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) { - 
hvx_scale_offset_f32_aa(dst, src, n, scale, offset); - } else { - hvx_scale_offset_f32_uu(dst, src, n, scale, offset); - } -} - -float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); -void hvx_mul_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - const uint8_t * restrict src2, - uint8_t * restrict dst, - const int num_elems); -void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_add_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_add_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_sub_f32(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_sub_f32_opt(const uint8_t * restrict src0, - const uint8_t * restrict src1, - uint8_t * restrict dst, - const int num_elems); -void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems); -void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems); -void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate); -float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems); -float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems); -void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); -void hvx_clamp_scalar_f32(const uint8_t * restrict src, - const float limit_left, - const float limit_right, - uint8_t * restrict dst, - const int num_elems); +#include "hex-utils.h" + +#include "hvx-types.h" +#include "hvx-copy.h" +#include "hvx-scale.h" +#include "hvx-exp.h" +#include "hvx-inverse.h" +#include "hvx-reduce.h" +#include "hvx-sigmoid.h" +#include "hvx-sqrt.h" +#include "hvx-arith.h" +#include "hvx-base.h" #endif /* HVX_UTILS_H */ diff --git a/src/ggml-hexagon/htp/main.c b/src/ggml-hexagon/htp/main.c index 24b3e90e..e28a67a9 100644 --- a/src/ggml-hexagon/htp/main.c +++ b/src/ggml-hexagon/htp/main.c @@ -1,17 +1,13 @@ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" #pragma clang diagnostic ignored "-Wunused-function" -#define FARF_ERROR 1 -#define FARF_HIGH 1 -#define FARF_MEDIUM 0 -#define FARF_LOW 0 +#include +#include #include #include #include #include -#include #include -#include #include #include #include @@ -19,13 +15,14 @@ #include #include +#include "hex-dma.h" +#include "hex-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "ops-utils.h" #include "worker-pool.h" AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { @@ -362,14 +359,14 @@ struct profile_data { static inline void profile_start(struct profile_data * d) { d->usecs = 
HAP_perf_get_qtimer_count(); - d->cycles = htp_get_cycles(); - d->pkts = htp_get_pktcnt(); + d->cycles = hex_get_cycles(); + d->pkts = hex_get_pktcnt(); } static inline void profile_stop(struct profile_data * d) { d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); - d->cycles = htp_get_cycles() - d->cycles; - d->pkts = htp_get_pktcnt() - d->pkts; + d->cycles = hex_get_cycles() - d->cycles; + d->pkts = hex_get_pktcnt() - d->pkts; } static int send_htp_rsp(struct htp_context * c, @@ -443,6 +440,43 @@ static void proc_matmul_req(struct htp_context * ctx, send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); } +static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { + struct dspqueue_buffer rsp_bufs[1]; + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[0].fd = bufs[1].fd; + rsp_bufs[0].ptr = bufs[1].ptr; + rsp_bufs[0].offset = bufs[1].offset; + rsp_bufs[0].size = bufs[1].size; + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.dst.data = (uint32_t) bufs[1].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_cpy(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); +} + static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { struct dspqueue_buffer rsp_bufs[1]; @@ -993,6 +1027,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { proc_get_rows_req(ctx, &req, bufs); break; + case HTP_OP_CPY: + if (n_bufs != 2) { + FARF(ERROR, "Bad cpy-req buffer list"); + continue; + } + proc_cpy_req(ctx, &req, bufs); + break; + default: FARF(ERROR, "Unknown Op %u", req.op); break; diff --git a/src/ggml-hexagon/htp/matmul-ops.c b/src/ggml-hexagon/htp/matmul-ops.c index 9bb39db9..1603ff2b 100644 --- a/src/ggml-hexagon/htp/matmul-ops.c +++ b/src/ggml-hexagon/htp/matmul-ops.c @@ -3,28 +3,20 @@ #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define MM_SPAD_SRC0_NROWS 16 #define MM_SPAD_SRC1_NROWS 16 @@ -36,20 +28,8 @@ struct htp_matmul_type { void (*vec_dot_rx2)(const int n, float * restrict s, const void * restrict vx, uint32_t vx_row_size, const void * restrict vy); }; -typedef struct { - HVX_Vector v[2]; -} HVX_Vector_x2; - -typedef struct { - HVX_Vector v[4]; -} HVX_Vector_x4; - -typedef struct { - HVX_Vector v[8]; -} HVX_Vector_x8; - // vdelta control to replicate first 4x fp32 values across lanes -static const uint8_t __attribute__((aligned(128))) repl_4x_fp32[128] = { +static const uint8_t 
__attribute__((aligned(128))) repl_4x_f32[128] = { 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, @@ -60,7 +40,7 @@ static const uint8_t __attribute__((aligned(128))) repl_4x_fp32[128] = { }; // vdelta control to replicate and interleave first 8x fp32 values across lanes -static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_fp32[128] = { +static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_f32[128] = { 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, @@ -71,7 +51,7 @@ static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_fp32[128] }; // vdelta control to replicate first fp32 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_1x_fp32[128] = { +static const uint8_t __attribute__((aligned(128))) repl_1x_f32[128] = { 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, @@ -82,7 +62,7 @@ static const uint8_t __attribute__((aligned(128))) repl_1x_fp32[128] = { }; // vdelta control to replicate first fp16 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_1x_fp16[128] = { +static const uint8_t __attribute__((aligned(128))) repl_1x_f16[128] = { 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, @@ -93,7 +73,7 @@ static const uint8_t __attribute__((aligned(128))) repl_1x_fp16[128] = { }; // vdelta control to replicate first fp16 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_2x_fp16[128] = { +static const uint8_t __attribute__((aligned(128))) repl_2x_f16[128] = { 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, @@ -129,7 +109,7 @@ static inline size_t q8x4x2_row_size(uint32_t ne) { // ensures perfect alignment of quants and full row const uint32_t qk = QK_Q8_0x4x2; const uint32_t nb = (ne + qk - 1) / qk; - return htp_round_up(ne + nb * 8 * sizeof(__fp16), 128); + return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128); } static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) { @@ -389,7 +369,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * } // Reduce and convert into fp32 - r0_sum = 
hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -485,8 +465,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -562,7 +542,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * } // Reduce and convert into fp32 - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -658,8 +638,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -768,7 +748,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, } // Reduce and convert into fp32 - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -900,8 +880,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -933,7 +913,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -977,8 +957,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n, rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf))); } - rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum0)); - rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum1)); + rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0)); + rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -1010,7 +990,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -1062,7 +1042,7 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), 
Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum)); + rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -1359,7 +1339,7 @@ static void matvec_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); } - hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); + hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); t2 = HAP_perf_get_qtimer_count(); @@ -1411,7 +1391,7 @@ static void matmul_id(struct htp_matmul_type * mt, struct htp_ops_context * octx const size_t src0_row_size = nb01; const size_t src1_row_size = q8x4x2_row_size(ne10); - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); // Per-thread VTCM scratchpads for all tensors // Note that the entire src1 tensor is already in VTCM @@ -1524,7 +1504,7 @@ static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx const size_t src0_row_size = nb01; const size_t src1_row_size = q8x4x2_row_size(ne10); - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); const uint32_t n_aids = src2->ne[0]; // num activated experts const uint32_t n_ids = ne02; // num experts @@ -1590,7 +1570,7 @@ static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx // *** dynamic quant -static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { +static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -1598,10 +1578,10 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri HVX_Vector zero = Q6_V_vsplat_R(0); // Use reduce max fp32 to find max(abs(e)) first - HVX_Vector vmax0_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[0])); - HVX_Vector vmax1_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[1])); - HVX_Vector vmax2_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[2])); - HVX_Vector vmax3_sf = hvx_vec_reduce_max_fp32(hvx_vec_abs_fp32(vx[3])); + HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); + HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); + HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); + HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); // Load and convert into QF32 HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements @@ -1623,7 +1603,7 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_fp16; + HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_f16; vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl); vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl); @@ -1641,8 +1621,8 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri hvx_vec_store_u(y_d + 6, 2, rotated_vd_hf); // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_fp16(vd01_hf); - HVX_Vector vd23_inv_hf = 
hvx_vec_inverse_fp16(vd23_hf); + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); @@ -1654,7 +1634,7 @@ static inline void quantize_block_fp32_q8x1(float * restrict x, uint8_t * restri *(HVX_Vector *) y_q = vx_i8; } -static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { +static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -1672,11 +1652,11 @@ static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restri HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Compute max and scale - HVX_Vector vmax01_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx01_hf)); - HVX_Vector vmax23_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx23_hf)); + HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); + HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_fp16; + HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16; vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl); vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl); @@ -1689,8 +1669,8 @@ static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restri hvx_vec_store_u(y_d + 4, 4, vd23_hf); // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_fp16(vd01_hf); - HVX_Vector vd23_inv_hf = hvx_vec_inverse_fp16(vd23_hf); + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); @@ -1702,7 +1682,7 @@ static inline void quantize_block_fp32_q8x2(float * restrict x, uint8_t * restri *(HVX_Vector *) y_q = vx_i8; } -static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { +static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -1720,11 +1700,11 @@ static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restri HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Compute max and scale - HVX_Vector vmax_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx01_hf)); - vmax_hf = hvx_vec_reduce_max2_fp16(hvx_vec_abs_fp16(vx23_hf), vmax_hf); + HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); + vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_fp16; + HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16; vmax_hf = Q6_V_vdelta_VV(vmax_hf, ctrl); HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 @@ -1733,7 +1713,7 @@ static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restri *(HVX_UVector *) y_d = vd_hf; // Divide input by the scale - HVX_Vector vd_inv_hf = hvx_vec_inverse_fp16(vd_hf); + HVX_Vector vd_inv_hf = 
hvx_vec_inverse_f16(vd_hf); vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd_inv_hf)); vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd_inv_hf)); @@ -1746,7 +1726,7 @@ static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restri } // Overrides input x -static void quantize_row_fp32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { +static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { assert(k % 32 == 0); const uint32_t qk = QK_Q8_0x4x2; const uint32_t nb = (k + qk - 1) / qk; @@ -1764,24 +1744,24 @@ static void quantize_row_fp32_q8x4x2(float * restrict x, uint8_t * restrict y, u for (uint32_t i = 0; i < nb; i++) { #if FP32_QUANTIZE_GROUP_SIZE == 32 - quantize_block_fp32_q8x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_fp32_q8x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + quantize_block_f32_q8x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); #elif FP32_QUANTIZE_GROUP_SIZE == 64 - quantize_block_fp32_q8x2(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_fp32_q8x2(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + quantize_block_f32_q8x2(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8x2(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); #elif FP32_QUANTIZE_GROUP_SIZE == 128 - quantize_block_fp32_q8x4(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_fp32_q8x4(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + quantize_block_f32_q8x4(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8x4(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); #else #error "FP32_QUANTIZE_GROUP_SIZE must be 32, 64, or 128" #endif } // now copy the scales into final location - hvx_copy_fp16_ua(y_d, t_d, nb * 8); + hvx_copy_f16_ua(y_d, t_d, nb * 8); } -static void quantize_fp32_q8x4x2(const struct htp_tensor * src, +static void quantize_f32_q8x4x2(const struct htp_tensor * src, uint8_t * restrict dst, struct htp_spad * spad, uint32_t nth, @@ -1807,26 +1787,26 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith); - const size_t src_row_size_padded = htp_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding for (uint32_t i = ir_first; i < ir_last; ++i) { - htp_l2fetch(src_data, 2, src_row_size, src_row_size); - hvx_copy_fp32_aa(tmp_data, src_data, ne0); + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); // FARF(HIGH, "quantize-q8x4-row: %u\n", i); - quantize_row_fp32_q8x4x2((float *) tmp_data, dst_data, ne0); + quantize_row_f32_q8x4x2((float *) tmp_data, dst_data, ne0); dst_data += dst_row_size; src_data += src_row_size; } uint64_t t2 
= HAP_perf_get_qtimer_count(); - FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, + FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void quantize_fp32_fp16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, +static void quantize_f32_f16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread, uint32_t dst_stride) { uint64_t t1 = HAP_perf_get_qtimer_count(); @@ -1848,8 +1828,8 @@ static void quantize_fp32_fp16(const struct htp_tensor * src, uint8_t * restrict uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); for (uint32_t i = ir_first; i < ir_last; ++i) { - htp_l2fetch(src_data, 2, src_row_size, src_stride); - hvx_copy_fp16_fp32_au(dst_data, src_data, ne0); + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f16_f32_au(dst_data, src_data, ne0); dst_data += dst_stride; src_data += src_stride; @@ -1857,12 +1837,12 @@ static void quantize_fp32_fp16(const struct htp_tensor * src, uint8_t * restrict uint64_t t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "quantize-fp32-fp16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, + FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // TODO just a plain copy that should be done via the DMA during the Op setup -static void quantize_fp16_fp16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, +static void quantize_f16_f16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread, uint32_t dst_stride) { uint64_t t1 = HAP_perf_get_qtimer_count(); @@ -1884,8 +1864,8 @@ static void quantize_fp16_fp16(const struct htp_tensor * src, uint8_t * restrict uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); for (uint32_t i = ir_first; i < ir_last; ++i) { - htp_l2fetch(src_data, 2, src_row_size, src_stride); - hvx_copy_fp16_au(dst_data, src_data, ne0); + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f16_au(dst_data, src_data, ne0); dst_data += dst_stride; src_data += src_stride; @@ -1893,23 +1873,23 @@ static void quantize_fp16_fp16(const struct htp_tensor * src, uint8_t * restrict uint64_t t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "quantize-fp16-fp16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, + FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { +static void htp_quantize_f32_q8x4x2(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; - quantize_fp32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread); + quantize_f32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread); } -static void htp_quantize_fp32_fp16(unsigned int n, unsigned int i, void * data) { +static void 
htp_quantize_f32_f16(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; - quantize_fp32_fp16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); + quantize_f32_f16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); } -static void htp_quantize_fp16_fp16(unsigned int n, unsigned int i, void * data) { +static void htp_quantize_f16_f16(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; - quantize_fp16_fp16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); + quantize_f16_f16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride); } // ** matmul/matvec callbacks for worker_pool @@ -2108,7 +2088,7 @@ int op_matmul(struct htp_ops_context * octx) { const size_t dst_row_size = nb1; size_t src1_row_size = nb11; - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); size_t src1_row_size_padded; worker_callback_t quant_job_func; @@ -2118,8 +2098,8 @@ int op_matmul(struct htp_ops_context * octx) { switch (src0->type) { case HTP_TYPE_Q4_0: - op_type = "q4x4x2-fp32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + op_type = "q4x4x2-f32"; + quant_job_func = htp_quantize_f32_q8x4x2; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_q4x4x2_q8x4x2; } else { @@ -2131,12 +2111,12 @@ int op_matmul(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2147,8 +2127,8 @@ int op_matmul(struct htp_ops_context * octx) { break; case HTP_TYPE_Q8_0: - op_type = "q8x4x2-fp32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + op_type = "q8x4x2-f32"; + quant_job_func = htp_quantize_f32_q8x4x2; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_q8x4x2_q8x4x2; } else { @@ -2160,12 +2140,12 @@ int op_matmul(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + 
octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2177,7 +2157,7 @@ int op_matmul(struct htp_ops_context * octx) { case HTP_TYPE_MXFP4: op_type = "mxfp4x4x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_mxfp4x4x2_q8x4x2; } else { @@ -2189,12 +2169,12 @@ int op_matmul(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2207,10 +2187,10 @@ int op_matmul(struct htp_ops_context * octx) { case HTP_TYPE_F16: { // Try optimized f16-f16 path first (src1 in VTCM) - const size_t f16_src1_row_size = htp_round_up(ne10 * 2, 128); - const size_t f16_src1_spad_size = htp_round_up(f16_src1_row_size * src1_nrows, 256); - const size_t f16_src0_spad_size = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads; - const size_t f16_dst_spad_size = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads; + const size_t f16_src1_row_size = hex_round_up(ne10 * 2, 128); + const size_t f16_src1_spad_size = hex_round_up(f16_src1_row_size * src1_nrows, 256); + const size_t f16_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads; + const size_t f16_dst_spad_size = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads; const size_t f16_total_size = f16_src1_spad_size + f16_src0_spad_size + f16_dst_spad_size; @@ -2222,7 +2202,7 @@ int op_matmul(struct htp_ops_context * octx) { if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) { // Optimized path op_type = "f16-f16"; - quant_job_func = (src1->type == HTP_TYPE_F32) ? htp_quantize_fp32_fp16 : htp_quantize_fp16_fp16; + quant_job_func = (src1->type == HTP_TYPE_F32) ? 
htp_quantize_f32_f16 : htp_quantize_f16_f16; if (src1_nrows > 1) { matmul_job_func = htp_matmul_2d_f16_f16; } else { @@ -2231,9 +2211,9 @@ int op_matmul(struct htp_ops_context * octx) { src1_row_size = f16_src1_row_size; // row size post quantization - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); octx->src1_spad.size = octx->src1_spad.size_per_thread; octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; @@ -2251,9 +2231,9 @@ int op_matmul(struct htp_ops_context * octx) { src1_row_size = nb11; // original row size in DDR - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256); - octx->src1_spad.size_per_thread = htp_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256); + octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256); octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads; @@ -2332,7 +2312,7 @@ int op_matmul_id(struct htp_ops_context * octx) { const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; - const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); const uint32_t src0_nrows = ne01; // per expert const uint32_t src1_nrows = ne11 * ne12 * ne13; @@ -2350,7 +2330,7 @@ int op_matmul_id(struct htp_ops_context * octx) { switch (src0->type) { case HTP_TYPE_Q4_0: op_type = "q4x2x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; src1_row_size = q8x4x2_row_size(ne10); // row size post quantization if (src1_nrows > 1) { matmul_id_job_func = htp_matmul_id_q4x4x2_q8x4x2; @@ -2360,13 +2340,13 @@ int op_matmul_id(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); - octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * 
sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2379,7 +2359,7 @@ int op_matmul_id(struct htp_ops_context * octx) { case HTP_TYPE_Q8_0: op_type = "q8x2x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; src1_row_size = q8x4x2_row_size(ne10); // row size post quantization if (src1_nrows > 1) { matmul_id_job_func = htp_matmul_id_q8x4x2_q8x4x2; @@ -2389,13 +2369,13 @@ int op_matmul_id(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); - octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } @@ -2408,7 +2388,7 @@ int op_matmul_id(struct htp_ops_context * octx) { case HTP_TYPE_MXFP4: op_type = "mxfp4x2x2-f32"; - quant_job_func = htp_quantize_fp32_q8x4x2; + quant_job_func = htp_quantize_f32_q8x4x2; src1_row_size = q8x4x2_row_size(ne10); // row size post quantization if (src1_nrows > 1) { matmul_id_job_func = htp_matmul_id_mxfp4x4x2_q8x4x2; @@ -2418,13 +2398,13 @@ int op_matmul_id(struct htp_ops_context * octx) { // Entire src1 tensor is placed into the VTCM // For other tensors we allocate N rows per thread, padded to HVX vector size - octx->dst_spad.size_per_thread = htp_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = htp_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); - octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); // src0 spad is also used in dynamic quantizer to store padded src1 rows - src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); if (octx->src0_spad.size_per_thread < src1_row_size_padded) { octx->src0_spad.size_per_thread = src1_row_size_padded; } diff --git 
a/src/ggml-hexagon/htp/ops-utils.h b/src/ggml-hexagon/htp/ops-utils.h deleted file mode 100644 index af9c3305..00000000 --- a/src/ggml-hexagon/htp/ops-utils.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef OPS_UTILS_H -#define OPS_UTILS_H - -#include "htp-msg.h" - -#ifndef MAX -# define MAX(a, b) ((a) > (b) ? (a) : (b)) -#endif - -#ifndef MIN -# define MIN(a, b) ((a) < (b) ? (a) : (b)) -#endif - -static inline uint64_t htp_get_cycles() { - uint64_t cycles = 0; - asm volatile(" %0 = c15:14\n" : "=r"(cycles)); - return cycles; -} - -static inline uint64_t htp_get_pktcnt() { - uint64_t pktcnt; - asm volatile(" %0 = c19:18\n" : "=r"(pktcnt)); - return pktcnt; -} - -static inline int32_t htp_is_aligned(void * addr, uint32_t align) { - return ((size_t) addr & (align - 1)) == 0; -} - -static inline uint32_t htp_round_up(uint32_t n, uint32_t m) { - return m * ((n + m - 1) / m); -} - -// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. -// Precompute mp (m' in the paper) and L such that division -// can be computed using a multiply (high 32b of 64b result) -// and a shift: -// -// n/d = (mulhi(n, mp) + n) >> L; -struct fastdiv_values { - uint32_t mp; - uint32_t l; -}; - -static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { - struct fastdiv_values result = { 0, 0 }; - // compute L = ceil(log2(d)); - while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { - ++(result.l); - } - - result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); - return result; -} - -static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { - // Compute high 32 bits of n * mp - const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32); // mulhi(n, mp) - // add n, apply bit shift - return (hi + n) >> vals->l; -} - -static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) { - return n - fastdiv(n, vals) * d; -} - -static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) { - const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); - asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); -} - -static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n && p < p_end; i++) { - p += snprintf(p, p_end - p, "%d, ", x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n && p < p_end; i++) { - p += snprintf(p, p_end - p, "%d, ", x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n; i++) { - p += snprintf(p, p_end - p, "%d, ", (int) x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n; i++) { - p += 
snprintf(p, p_end - p, "%.6f, ", (float) x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) { - char str[1024], *p = str, *p_end = str + sizeof(str); - p += snprintf(p, p_end - p, "%s: ", pref); - for (int i = 0; i < n; i++) { - p += snprintf(p, p_end - p, "%.6f, ", x[i]); - } - FARF(HIGH, "%s\n", str); -} - -static inline void htp_dump_f32(char * pref, const float * x, uint32_t n) { - uint32_t n0 = n / 16; - uint32_t n1 = n % 16; - - uint32_t i = 0; - for (; i < n0; i++) { - htp_dump_fp32_line(pref, x + (16 * i), 16); - } - if (n1) { - htp_dump_fp32_line(pref, x + (16 * i), n1); - } -} - -static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { - uint32_t n0 = n / 16; - uint32_t n1 = n % 16; - - uint32_t i = 0; - for (; i < n0; i++) { - htp_dump_fp16_line(pref, x + (16 * i), 16); - } - if (n1) { - htp_dump_fp16_line(pref, x + (16 * i), n1); - } -} - -#endif /* OPS_UTILS_H */ diff --git a/src/ggml-hexagon/htp/rope-ops.c b/src/ggml-hexagon/htp/rope-ops.c index a4399704..943ca5c9 100644 --- a/src/ggml-hexagon/htp/rope-ops.c +++ b/src/ggml-hexagon/htp/rope-ops.c @@ -2,27 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" // Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h #define HTP_ROPE_TYPE_NORMAL 0 @@ -370,8 +363,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int int is_aligned = 1; int opt_path = 0; - if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || - (0 == htp_is_aligned((void *) dst->data, VLEN))) { + if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || + (0 == hex_is_aligned((void *) dst->data, VLEN))) { FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); is_aligned = 0; } @@ -427,9 +420,9 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/set-rows-ops.c b/src/ggml-hexagon/htp/set-rows-ops.c index bdd64fcc..904484da 100644 --- a/src/ggml-hexagon/htp/set-rows-ops.c +++ b/src/ggml-hexagon/htp/set-rows-ops.c @@ -2,24 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include + #include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define 
GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define set_rows_preamble \ const uint32_t ne00 = octx->src0.ne[0]; \ @@ -76,7 +72,7 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const uintptr_t dst_ptr = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3; // copy row - hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); + hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } } } @@ -112,7 +108,7 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const uint8_t* src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03; uint8_t* dst_ptr = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3; - hvx_copy_fp16_fp32_uu(dst_ptr, src0_ptr, ne00); + hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00); } } } diff --git a/src/ggml-hexagon/htp/softmax-ops.c b/src/ggml-hexagon/htp/softmax-ops.c index 80d249a2..1b6b2eba 100644 --- a/src/ggml-hexagon/htp/softmax-ops.c +++ b/src/ggml-hexagon/htp/softmax-ops.c @@ -2,27 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define htp_softmax_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ @@ -100,8 +93,8 @@ static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, uint8_t * restrict dst_curr = dst; const uint8_t * restrict mask_curr = mask; - HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); - HVX_Vector slope_vec = hvx_vec_splat_fp32(slope); + HVX_Vector scale_vec = hvx_vec_splat_f32(scale); + HVX_Vector slope_vec = hvx_vec_splat_f32(slope); int step_of_1 = num_elems >> 5; @@ -134,9 +127,9 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, HVX_Vector * restrict v_dst = (HVX_Vector *) dst; HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); - HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]); + HVX_Vector max_vec = hvx_vec_splat_f32(((const float *) src)[0]); HVX_Vector zero_v = Q6_V_vzero(); - HVX_Vector one_v = hvx_vec_splat_fp32(1.0); + HVX_Vector one_v = hvx_vec_splat_f32(1.0); int step_of_1 = num_elems >> 5; @@ -146,7 +139,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1); } - HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec); + HVX_Vector v = hvx_vec_reduce_max_f32(max_vec); max_vec = hvx_vec_repl4(v); #pragma unroll(4) @@ -154,18 +147,18 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec); - HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2)); + HVX_Vector v3 = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(v2)); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3); v_pad[i] = v3; } - v = hvx_vec_qf32_reduce_sum(sum_vec); + v = hvx_vec_reduce_sum_qf32(sum_vec); sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v)); HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v); - HVX_Vector v4 = hvx_vec_inverse_fp32(sum_vec); + HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec); HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, 
v4, one_v); #pragma unroll(4) @@ -181,11 +174,11 @@ static float hvx_softmax_f32(const uint8_t * restrict src, uint8_t * restrict spad, const int num_elems, const float max) { - hvx_sub_scalar_f32(src, max, spad, num_elems); + hvx_sub_scalar_f32(spad, src, max, num_elems); hvx_exp_f32(spad, dst, num_elems, false); - float sum = hvx_self_sum_f32(dst, num_elems); + float sum = hvx_reduce_sum_f32(dst, num_elems); return sum; } @@ -255,7 +248,7 @@ static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ct if (1 == opt_path) { hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); } else { - float max = hvx_self_max_f32((const uint8_t *) wp0, ne00); + float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); sum = sum > 0.0 ? (1.0 / sum) : 1; hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); @@ -290,7 +283,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int int is_aligned = 1; int opt_path = 0; - if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + if (!hex_is_aligned((void *) src0->data, VLEN) || !hex_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } @@ -345,9 +338,9 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/unary-ops.c b/src/ggml-hexagon/htp/unary-ops.c index 8ed1e5b6..be8be8c4 100644 --- a/src/ggml-hexagon/htp/unary-ops.c +++ b/src/ggml-hexagon/htp/unary-ops.c @@ -2,28 +2,20 @@ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-but-set-variable" -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include -#include #include -#include -#include -#include + #include -#include #include +#include "hex-dma.h" +#include "hvx-utils.h" + #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-dma.h" #include "htp-msg.h" #include "htp-ops.h" -#include "hvx-utils.h" -#include "ops-utils.h" #define htp_unary_preamble \ const uint32_t ne00 = src->ne[0]; \ @@ -55,7 +47,7 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, HVX_Vector * restrict v_dst = (HVX_Vector *) dst; HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); - HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); int step_of_1 = num_elems >> 5; #pragma unroll(4) @@ -65,15 +57,15 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); } - HVX_Vector reduced_sum = hvx_vec_qf32_reduce_sum(sum_v); + HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v); sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum)); - HVX_Vector t_v = hvx_vec_splat_fp32((float) num_elems); - 
HVX_Vector denom_v = hvx_vec_inverse_fp32(t_v); + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v); HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v); - HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { @@ -101,7 +93,7 @@ static void scale_htp_f32(const float * restrict src, float * restrict dst_local = dst + (ir * row_elems); if (ir + 1 < num_rows) { - htp_l2fetch(src_local + row_elems, 1, row_size, row_size); + hex_l2fetch(src_local + row_elems, row_size, row_size, 1); } hvx_scale_offset_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias); @@ -124,7 +116,7 @@ static void rms_norm_htp_f32(const float * restrict src, float * restrict dst_local = dst + (ir * row_elems); if (ir + 1 < num_rows) { - htp_l2fetch(src_local + row_elems, 1, row_size, row_size); + hex_l2fetch(src_local + row_elems, row_size, row_size, 1); } if (1 == opt_path) { @@ -168,9 +160,8 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src, int is_aligned = 1; int opt_path = 0; - if ((0 == htp_is_aligned((void *) src->data, VLEN)) || (0 == htp_is_aligned((void *) dst->data, VLEN))) { + if ((0 == hex_is_aligned((void *) src->data, VLEN)) || (0 == hex_is_aligned((void *) dst->data, VLEN))) { is_aligned = 0; - FARF(HIGH, "unary-f32: unaligned addresses in unary op, possibly slower execution\n"); } if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { opt_path = 1; @@ -240,8 +231,8 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const size_t dst_row_size = dst->nb[1]; // VTCM scratchpads for all tensors - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; + octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; size_t spad_size = octx->src0_spad.size + octx->dst_spad.size; diff --git a/src/ggml-hexagon/htp/worker-pool.c b/src/ggml-hexagon/htp/worker-pool.c index cd38c212..894815f4 100644 --- a/src/ggml-hexagon/htp/worker-pool.c +++ b/src/ggml-hexagon/htp/worker-pool.c @@ -7,10 +7,6 @@ #include #include -#ifdef HTP_DEBUG -# define FARF_HIGH 1 -#endif - #include "HAP_farf.h" #define WORKER_THREAD_STACK_SZ (2 * 16384)