void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
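+ // Process whole HVX vectors of VLEN_FP32 fp32 elements first; the remaining
+ // left_over elements are handled with a partial unaligned store at the end.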
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
-
+ int unalign_address = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
+ unalign_address = 1;
}
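+ // Instead of asserting on alignment, remember it and take the unaligned
+ // (HVX_UVector) path below; it still works, just possibly slower.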
- assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
-
const float * src_f = (const float *) src;
- HVX_Vector vec_min = Q6_V_vsplat_R(val);
+ HVX_Vector vec_min = hvx_vec_splat_fp32(val);
- HVX_Vector * restrict vec_in = (HVX_Vector *) src;
- HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
+ if (unalign_address == 0) {
+ HVX_Vector * restrict vec_in = (HVX_Vector *) src;
+ HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
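+ // Aligned fast path: element-wise min of each input vector against the splatted
+ // scalar. Q6_Vsf_vmin_VsfVsf already returns sf, so no qf32 conversion is needed.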
- #pragma unroll(4)
- for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
- vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
- *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
+ #pragma unroll(4)
+ for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+ HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+ *vec_out++ = min_clamp;
+ }
+ } else {
+ HVX_UVector * restrict vec_in = (HVX_UVector *) src;
+ HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
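+ // Unaligned path: identical loop, but HVX_UVector pointers let the compiler
+ // emit unaligned vector loads/stores.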
+
+ #pragma unroll(4)
+ for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+ HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+ *vec_out++ = min_clamp;
+ }
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
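+ // Tail: load one unaligned vector and store only the left_over elements.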
HVX_Vector in = *(HVX_UVector *) srcf;
- vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, in);
+ HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in);
- hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(vec_min));
+ hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, min_clamp);
}
}
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
+ int unalign_address = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
+ unalign_address = 1;
}
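+ // As in hvx_min_scalar_f32, fall back to the unaligned path instead of asserting.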
- assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
-
- HVX_Vector * restrict vec_in = (HVX_Vector *) src;
- HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
-
HVX_Vector range_left = hvx_vec_splat_fp32(limit_left);
HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);
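+ // Clamp each element to [limit_left, limit_right]: compare against each bound
+ // and use vmux to substitute the bound wherever it is exceeded.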
- #pragma unroll(4)
- for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
- HVX_Vector in_vec = *vec_in++;
- HVX_Vector temp_v = in_vec;
+ if (unalign_address == 0) {
+ HVX_Vector * restrict vec_in = (HVX_Vector *) src;
+ HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
- HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
- HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
- in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
- in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
- *vec_out++ = Q6_Vsf_equals_Vqf32(in_vec);
+ #pragma unroll(4)
+ for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+ HVX_Vector in_vec = *vec_in++;
+ HVX_Vector temp_v = in_vec;
+
+ HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
+ HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
+
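+ // Clamp to the upper bound first, then apply the lower bound to that result
+ // so that both limits take effect.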
+ in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
+ in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
+
+ *vec_out++ = in_vec;
+ }
+
+ } else {
+
+ HVX_UVector * restrict vec_in = (HVX_UVector *) src;
+ HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
+
+ #pragma unroll(4)
+ for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+ HVX_Vector in_vec = *vec_in++;
+ HVX_Vector temp_v = in_vec;
+
+ HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
+ HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
+
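+ // Same ordering as the aligned loop: upper bound first, then lower bound.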
+ in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
+ in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
+
+ *vec_out++ = in_vec;
+ }
+
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
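+ // Tail: clamp one unaligned vector and store only the left_over elements.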
- HVX_Vector in = *(HVX_UVector *) srcf;
+ HVX_Vector in_vec = *(HVX_UVector *) srcf;
- HVX_Vector temp_v = in;
+ HVX_Vector temp_v = in_vec;
- HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right);
- HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in);
+ HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
+ HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
- in = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
- in = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
+ in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
+ in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
- hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(in));
+ hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
}
}