}
#endif
+#if defined(__riscv_v_intrinsic)
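+// f32 multiply-add at each LMUL: vfmacc computes c += a * b over a full register group (vlmax elements)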
+template <> inline vfloat32m1_t madd(vfloat32m1_t a, vfloat32m1_t b, vfloat32m1_t c) {
+ return __riscv_vfmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
+}
+template <> inline vfloat32m2_t madd(vfloat32m2_t a, vfloat32m2_t b, vfloat32m2_t c) {
+ return __riscv_vfmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
+}
+template <> inline vfloat32m4_t madd(vfloat32m4_t a, vfloat32m4_t b, vfloat32m4_t c) {
+ return __riscv_vfmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
+}
+template <> inline vfloat32m8_t madd(vfloat32m8_t a, vfloat32m8_t b, vfloat32m8_t c) {
+ return __riscv_vfmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
+}
+#endif
+
#if defined(__riscv_zvfh)
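+// Zvfh: widening multiply-add, f16 sources accumulate into f32 at twice the LMUL (vfwmacc)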
-template <>
-inline vfloat32m1_t madd(vfloat16mf2_t a, vfloat16mf2_t b, vfloat32m1_t c) {
+template <> inline vfloat32m1_t madd(vfloat16mf2_t a, vfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
-inline vfloat32m2_t madd(vfloat16m1_t a, vfloat16m1_t b, vfloat32m2_t c) {
+template <> inline vfloat32m2_t madd(vfloat16m1_t a, vfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
-inline vfloat32m4_t madd(vfloat16m2_t a, vfloat16m2_t b, vfloat32m4_t c) {
+template <> inline vfloat32m4_t madd(vfloat16m2_t a, vfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
-inline vfloat32m8_t madd(vfloat16m4_t a, vfloat16m4_t b, vfloat32m8_t c) {
+template <> inline vfloat32m8_t madd(vfloat16m4_t a, vfloat16m4_t b, vfloat32m8_t c) {
return __riscv_vfwmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
-inline vfloat32m1_t madd(vfloat32m1_t a, vfloat32m1_t b, vfloat32m1_t c) {
- return __riscv_vfmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
-}
-inline vfloat32m2_t madd(vfloat32m2_t a, vfloat32m2_t b, vfloat32m2_t c) {
- return __riscv_vfmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
-}
-inline vfloat32m4_t madd(vfloat32m4_t a, vfloat32m4_t b, vfloat32m4_t c) {
- return __riscv_vfmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
-}
-inline vfloat32m8_t madd(vfloat32m8_t a, vfloat32m8_t b, vfloat32m8_t c) {
- return __riscv_vfmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
-}
#endif
#if defined(__riscv_zvfbfwma)
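+// Zvfbfwma: widening bf16 multiply-add into f32 (vfwmaccbf16)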
-inline vfloat32m1_t madd(vbfloat16mf2_t a, vbfloat16mf2_t b, vfloat32m1_t c) {
+template <> inline vfloat32m1_t madd(vbfloat16mf2_t a, vbfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmaccbf16_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
-inline vfloat32m2_t madd(vbfloat16m1_t a, vbfloat16m1_t b, vfloat32m2_t c) {
+template <> inline vfloat32m2_t madd(vbfloat16m1_t a, vbfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmaccbf16_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
-inline vfloat32m4_t madd(vbfloat16m2_t a, vbfloat16m2_t b, vfloat32m4_t c) {
+template <> inline vfloat32m4_t madd(vbfloat16m2_t a, vbfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmaccbf16_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
+template <> inline vfloat32m8_t madd(vbfloat16m4_t a, vbfloat16m4_t b, vfloat32m8_t c) {
+ return __riscv_vfwmaccbf16_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
+}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
}
#endif // __AVX512F__
-#if defined(__riscv_zvfh)
+#if defined(__riscv_v_intrinsic)
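+// horizontal sum: unordered reduction into element 0 of an m1 register, then extracted to a scalar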
inline float hsum(vfloat32m1_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m1_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m1()));
}
#endif
+#if defined(__riscv_v_intrinsic)
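+// full register-group f32 loads, one specialization per LMUL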
+template <> inline vfloat32m1_t load(const float *p) {
+ return __riscv_vle32_v_f32m1(p, __riscv_vsetvlmax_e32m1());
+}
+template <> inline vfloat32m2_t load(const float *p) {
+ return __riscv_vle32_v_f32m2(p, __riscv_vsetvlmax_e32m2());
+}
+template <> inline vfloat32m4_t load(const float *p) {
+ return __riscv_vle32_v_f32m4(p, __riscv_vsetvlmax_e32m4());
+}
+template <> inline vfloat32m8_t load(const float *p) {
+ return __riscv_vle32_v_f32m8(p, __riscv_vsetvlmax_e32m8());
+}
+#endif
+
#if defined(__riscv_zvfh)
template <> inline vfloat16mf2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16mf2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16mf2());
template <> inline vfloat16m4_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m4(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m4());
}
-template <> inline vfloat32m1_t load(const float *p) {
- return __riscv_vle32_v_f32m1(p, __riscv_vsetvlmax_e32m1());
-}
-template <> inline vfloat32m2_t load(const float *p) {
- return __riscv_vle32_v_f32m2(p, __riscv_vsetvlmax_e32m2());
-}
-template <> inline vfloat32m4_t load(const float *p) {
- return __riscv_vle32_v_f32m4(p, __riscv_vsetvlmax_e32m4());
-}
-template <> inline vfloat32m8_t load(const float *p) {
- return __riscv_vle32_v_f32m8(p, __riscv_vsetvlmax_e32m8());
-}
#endif
#if defined(__riscv_zvfbfwma)
template <> inline vbfloat16m2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m2());
}
+template <> inline vbfloat16m4_t load(const ggml_bf16_t *p) {
+ return __riscv_vle16_v_bf16m4(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m4());
+}
#endif
-#if defined(__riscv_zvfh)
+#if defined(__riscv_v_intrinsic)
template <typename T> T set_zero();
-template <> inline vfloat16mf2_t set_zero() {
- return __riscv_vfmv_v_f_f16mf2(0, __riscv_vsetvlmax_e16mf2());
-}
-template <> inline vfloat16m1_t set_zero() {
- return __riscv_vfmv_v_f_f16m1(0, __riscv_vsetvlmax_e16m1());
-}
-template <> inline vfloat16m2_t set_zero() {
- return __riscv_vfmv_v_f_f16m2(0, __riscv_vsetvlmax_e16m2());
-}
-template <> inline vfloat16m4_t set_zero() {
- return __riscv_vfmv_v_f_f16m4(0, __riscv_vsetvlmax_e16m4());
-}
template <> inline vfloat32m1_t set_zero() {
return __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
}
#if defined(__riscv_v_intrinsic)
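+// vlmax<T>: number of elements one register group of type T holds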
template <typename T> size_t vlmax() {
- if constexpr (std::is_same_v<T, vfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
- else if constexpr (std::is_same_v<T, vfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
- else if constexpr (std::is_same_v<T, vfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
- else if constexpr (std::is_same_v<T, vfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
- else if constexpr (std::is_same_v<T, vfloat32m1_t>) { return __riscv_vsetvlmax_e32m1(); }
+ if constexpr (std::is_same_v<T, vfloat32m1_t>) { return __riscv_vsetvlmax_e32m1(); }
else if constexpr (std::is_same_v<T, vfloat32m2_t>) { return __riscv_vsetvlmax_e32m2(); }
else if constexpr (std::is_same_v<T, vfloat32m4_t>) { return __riscv_vsetvlmax_e32m4(); }
else if constexpr (std::is_same_v<T, vfloat32m8_t>) { return __riscv_vsetvlmax_e32m8(); }
+ #if defined(__riscv_zvfh)
+ else if constexpr (std::is_same_v<T, vfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
+ else if constexpr (std::is_same_v<T, vfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
+ else if constexpr (std::is_same_v<T, vfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
+ else if constexpr (std::is_same_v<T, vfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
+ #endif
+ #if defined(__riscv_zvfbfwma)
+ else if constexpr (std::is_same_v<T, vbfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
+ else if constexpr (std::is_same_v<T, vbfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
+ else if constexpr (std::is_same_v<T, vbfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
+ else if constexpr (std::is_same_v<T, vbfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
+ #endif
return 0;
}
#endif
params->ith, params->nth};
tb.matmul(m, n);
return true;
-#elif defined(__riscv_zvfh)
+#elif defined(__riscv_v_intrinsic)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat32m1_t, float, float, float> tb{ params,
k, (const float *)A, lda,
return true;
}
#elif defined(__riscv_zvfbfwma)
- #if LMUL == 1
- tinyBLAS_RVV<vfloat32m1_t, vbfloat16mf2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
- k, (const ggml_bf16_t *)A, lda,
- (const ggml_bf16_t *)B, ldb,
- (float *)C, ldc};
- #elif LMUL == 2
- tinyBLAS_RVV<vfloat32m2_t, vbfloat16m1_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
- k, (const ggml_bf16_t *)A, lda,
- (const ggml_bf16_t *)B, ldb,
- (float *)C, ldc};
- #else // LMUL = 4
- tinyBLAS_RVV<vfloat32m4_t, vbfloat16m2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
- k, (const ggml_bf16_t *)A, lda,
- (const ggml_bf16_t *)B, ldb,
- (float *)C, ldc};
- #endif
- return tb.matmul(m, n);
+ if (Btype == GGML_TYPE_BF16) {
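+ // only the bf16 x bf16 case is handled; other B types fall through to the return false below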
+ #if LMUL == 1
+ tinyBLAS_RVV<vfloat32m1_t, vbfloat16mf2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
+ k, (const ggml_bf16_t *)A, lda,
+ (const ggml_bf16_t *)B, ldb,
+ (float *)C, ldc};
+ #elif LMUL == 2
+ tinyBLAS_RVV<vfloat32m2_t, vbfloat16m1_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
+ k, (const ggml_bf16_t *)A, lda,
+ (const ggml_bf16_t *)B, ldb,
+ (float *)C, ldc};
+ #else // LMUL = 4
+ tinyBLAS_RVV<vfloat32m4_t, vbfloat16m2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
+ k, (const ggml_bf16_t *)A, lda,
+ (const ggml_bf16_t *)B, ldb,
+ (float *)C, ldc};
+ #endif
+ return tb.matmul(m, n);
+ }
#endif
return false;
}
const int ggml_f16_epr = sve_register_length / 16; // running when 16
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
- const int np = (n & ~(ggml_f16_step - 1));
+ int np = (n & ~(ggml_f16_step - 1));
svfloat16_t sum_00 = svdup_n_f16(0.0f);
svfloat16_t sum_01 = svdup_n_f16(0.0f);
}
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
+ np = n;
+ #elif defined(__riscv_v_intrinsic)
+ #if defined(__riscv_zvfh)
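+ // dot y against two rows x[0] and x[1], widening f16 products into f32 accumulators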
+ size_t vl = __riscv_vsetvlmax_e32m4();
+
+ // initialize accumulators to all zeroes
+ vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+ // calculate step size
+ const size_t epr = __riscv_vsetvlmax_e16m2();
+ const size_t step = epr * 2;
+ int np = (n & ~(step - 1));
+
+ // unroll by 2 along the row dimension
+ for (int i = 0; i < np; i += step) {
+ vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+ vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+ vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+ vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+ vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+ vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+ vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+ vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+ vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+ vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+ }
- #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
- size_t vl = __riscv_vsetvlmax_e32m4();
-
- // initialize accumulators to all zeroes
- vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
- vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
- vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
- vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
-
- // calculate step size
- const size_t epr = __riscv_vsetvlmax_e16m2();
- const size_t step = epr * 2;
- const int np = (n & ~(step - 1));
-
- // unroll by 2 along the row dimension
- for (int i = 0; i < np; i += step) {
- vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
- vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
- vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
- vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
- vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
-
- vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
- vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
- vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
- vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
- vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
- }
-
- vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
- vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
-
- // leftovers
- for (int i = np; i < n; i += vl) {
- vl = __riscv_vsetvl_e16m2(n - i);
- vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
- vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
- vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+ vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+ vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
- vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
- vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
- }
+ // leftovers
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
- // reduce
- vl = __riscv_vsetvlmax_e32m2();
- vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
- __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
- vl = __riscv_vsetvlmax_e32m1();
- vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
- __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
- vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
- acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
-
- vl = __riscv_vsetvlmax_e32m2();
- vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
- __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
- vl = __riscv_vsetvlmax_e32m1();
- vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
- __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
- vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
- acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
- sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
- sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+ vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+ vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+ }
+ // reduce: fold m4 -> m2 -> m1 halves, then reduction-sum to a scalar
+ vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+ __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+ __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+ vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+ acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+ vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+ __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+ __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+ vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+ acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+ sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+ sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+ np = n;
+ #else
+ const int np = 0;
+ #endif
#else
const int np = (n & ~(GGML_F16_STEP - 1));
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
}
-
- // leftovers
- for (int i = np; i < n; ++i) {
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
- }
- }
#endif
#else
- for (int i = 0; i < n; ++i) {
+ // scalar path
+ const int np = 0;
+#endif
+ // scalar and leftovers
+ for (int i = np; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
}
-#endif
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
s[i] = (float)sumf[i];
svst1_f16(pg, (__fp16 *)(y + np2), hy);
}
np = n;
-#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
- const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
- const _Float16 scale = *(const _Float16*)(&s);
-
- // calculate step size
- const int epr = __riscv_vsetvlmax_e16m4();
- const int step = epr * 2;
- int np = (n & ~(step - 1));
-
- // unroll by 2
- for (int i = 0; i < np; i += step) {
- vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
- vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
- ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
- __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
- __asm__ __volatile__ ("" ::: "memory");
-
- vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
- vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
- ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
- __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
- __asm__ __volatile__ ("" ::: "memory");
- }
+#elif defined(__riscv_v_intrinsic)
+ #if defined(__riscv_zvfh)
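+ // f16 axpy: y += v * x via vfmacc_vf, with v converted to _Float16 once up front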
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+ const _Float16 scale = *(const _Float16*)(&s);
- // leftovers
- int vl;
- for (int i = np; i < n; i += vl) {
- vl = __riscv_vsetvl_e16m4(n - i);
- vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
- vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
- ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
- __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
- }
- np = n;
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m4();
+ const int step = epr * 2;
+ int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (int i = 0; i < np; i += step) {
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
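+ // compiler-only memory barrier: keeps the interleaved loads and stores from being reordered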
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+ ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
+
+ // leftovers
+ int vl;
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
+ }
+ np = n;
+ #else
+ // fall back to the scalar path
+ const int np = 0;
+ #endif
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
}
}
#else
+ // scalar path
const int np = 0;
#endif
- // leftovers
+ // scalar and leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
const int ggml_f16_step = 2 * ggml_f16_epr;
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
- const int np = (n & ~(ggml_f16_step - 1));
+ int np = (n & ~(ggml_f16_step - 1));
svfloat16_t ay1, ay2;
for (int i = 0; i < np; i += ggml_f16_step) {
svfloat16_t out = svmul_f16_m(pg, hy, vx);
svst1_f16(pg, (__fp16 *)(y + np), out);
}
-#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
- const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
- const _Float16 scale = *(const _Float16*)(&s);
-
- // calculate step size
- const int epr = __riscv_vsetvlmax_e16m4();
- const int step = epr * 2;
- const int np = (n & ~(step - 1));
+ np = n;
+#elif defined(__riscv_v_intrinsic)
+ #if defined(__riscv_zvfh)
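+ // f16 scale in place: y *= v via vfmul_vf, unrolled by 2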
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+ const _Float16 scale = *(const _Float16*)(&s);
- // unroll by 2
- for (int i = 0; i < np; i += step) {
- vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
- ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
- __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
- __asm__ __volatile__ ("" ::: "memory");
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m4();
+ const int step = epr * 2;
+ int np = (n & ~(step - 1));
- vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
- ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
- __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
- __asm__ __volatile__ ("" ::: "memory");
- }
+ // unroll by 2
+ for (int i = 0; i < np; i += step) {
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+ ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
- // leftovers
- int vl;
- for (int i = np; i < n; i += vl) {
- vl = __riscv_vsetvl_e16m4(n - i);
- vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
- ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
- __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
- }
+ // leftovers
+ int vl;
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
+ }
+ np = n;
+ #else
+ // fall back to the scalar path
+ const int np = 0;
+ #endif
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
}
}
-
- // leftovers
- for (int i = np; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
- }
#else
- // scalar
- for (int i = 0; i < n; ++i) {
+ // scalar path
+ const int np = 0;
+#endif
+ // scalar and leftovers
+ for (int i = np; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
}
-#endif
}
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }