#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
+#include <syclcompat/math.hpp>
#include <oneapi/mkl.hpp>
#include <map>
: id);
}
// Helper deleted by this patch (every line below carries the '-' diff marker).
// It wraps `val` in a one-element sycl::vec, reinterprets that storage with
// as<>() as four 8-bit lanes (int8_t when T is signed, uint8_t otherwise),
// and then converts each lane back to T, yielding a sycl::vec<T, 4>.
// NOTE(review): as<>() presumably needs sizeof(T) == 4 for the sizes to
// match -- confirm against the SYCL vec specification.
- template <typename T>
- sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
- {
- return sycl::vec<T, 1>(val)
- .template as<sycl::vec<
- std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
- .template convert<T>();
- }
-
// Alias deleted by this patch (every line below carries the '-' diff marker).
// It selected the accumulator type for the old hand-written dp4a body:
// uint32_t only when both T1 and T2 are unsigned, int32_t in every other case.
- template <typename T1, typename T2>
- using dot_product_acc_t =
- std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
- uint32_t, int32_t>;
-
// Four-lane dot product of `a` and `b`, accumulated onto `c`.
//
// The '-' lines are the previous hand-written body: it seeded the result from
// `c` as a dot_product_acc_t<T1, T2>, split `a` and `b` into four lanes each
// with extract_and_sign_or_zero_extend4, and added the four lane products.
//
// The '+' line replaces that body with a single forwarding call to
// syclcompat::dp4a(a, b, c); the `auto` return type is now deduced from that
// call rather than from dot_product_acc_t.
// NOTE(review): presumably syclcompat::dp4a keeps the same signedness and
// accumulator-type rules as the removed body -- confirm before relying on it.
template <typename T1, typename T2, typename T3>
inline auto dp4a(T1 a, T2 b, T3 c)
{
- dot_product_acc_t<T1, T2> res = c;
- auto va = extract_and_sign_or_zero_extend4(a);
- auto vb = extract_and_sign_or_zero_extend4(b);
- res += va[0] * vb[0];
- res += va[1] * vb[1];
- res += va[2] * vb[2];
- res += va[3] * vb[3];
- return res;
+ return syclcompat::dp4a(a, b, c);
}
struct sub_sat
grid1[0] ^ signs[0], signs[0], std::minus<>());
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
grid2[0] ^ signs[1], signs[1], std::minus<>());
- sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
- sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
+ sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+ sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
q8 += 8;
aux32 >>= 7;
}
grid1[0] ^ signs0, signs0, std::minus<>());
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
grid2[0] ^ signs1, signs1, std::minus<>());
- sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
- sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
+ sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+ sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
q8 += 8;
}
const float d =