return sycl::trunc(x);
}
-template<typename T>
-static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_sgn(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_abs(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_elu(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_gelu(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_silu(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_gelu_quick(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
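+// Generic element-wise unary kernel: each work-item unravels its flat index
+// into 4D coordinates and applies `func` through the byte strides of the
+// source and destination, so non-contiguous tensors are handled as well.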
+template<typename T, typename F>
+static void unary_op_generic_kernel(
+ const T * x,
+ T * dst,
+ const int k,
+ const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
+ const size_t nb0, const size_t nb1, const size_t nb2, const size_t nb3,
+ const size_t nbd0, const size_t nbd1, const size_t nbd2, const size_t nbd3,
+ const sycl::nd_item<1> & item_ct1,
+ F func) {
+
+    (void) ne3; // i3 is derived from the flat index, so ne3 itself is unused; silence the warning
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_gelu_erf(x[i]);
- }
-}
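+        // unravel the flat index i into the 4D coordinates (i0, i1, i2, i3)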
+ const int64_t i0 = i % ne0;
+ const int64_t i1 = (i / ne0) % ne1;
+ const int64_t i2 = (i / (ne0*ne1)) % ne2;
+ const int64_t i3 = i / (ne0*ne1*ne2);
-template<typename T>
-static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_tanh(x[i]);
- }
-}
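+        // ggml strides are expressed in bytes, so index through char pointers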
+ const char * src_base = (const char *) x;
+ char * dst_base = (char *) dst;
-template<typename T>
-static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_relu(x[i]);
- }
-}
+        const T * srcp = (const T *)(src_base + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+        T * dstp = (T *)(dst_base + i0*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3);
-template<typename T>
-static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_sigmoid(x[i]);
+ *dstp = func(*srcp);
}
}
-template<typename T>
-static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_hardsigmoid(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_hardswish(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_exp(x[i]);
- }
-}
-
template<typename T>
static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
         dst[i] = op_log(x[i]);
     }
}
-template<typename T>
-static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_neg(x[i]);
- }
-}
-
-template<typename T>
-static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
- SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
- dst[i] = op_step(x[i]);
- }
-}
template<typename T>
 static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
         dst[i] = op_leaky_relu(x[i], negative_slope);
     }
 }
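+// Host-side wrapper: captures the destination shape and the source/destination
+// byte strides, then launches unary_op_generic_kernel with the element-wise `func`.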
+template<typename F>
+static inline void ggml_sycl_op_unary(
+ ggml_backend_sycl_context & ctx, ggml_tensor * dst, F func) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+
+ const size_t nb0 = src0->nb[0];
+ const size_t nb1 = src0->nb[1];
+ const size_t nb2 = src0->nb[2];
+ const size_t nb3 = src0->nb[3];
+
+ const size_t nbd0 = dst->nb[0];
+ const size_t nbd1 = dst->nb[1];
+ const size_t nbd2 = dst->nb[2];
+ const size_t nbd3 = dst->nb[3];
+
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [=](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+
+            constexpr int block_size = 256;
+            const int num_blocks = ceil_div(k_elements, block_size);
+
+            stream->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(block_size),
+                                  sycl::range<1>(block_size)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_generic_kernel(
+ src, dst_ptr, k_elements,
+ ne0, ne1, ne2, ne3,
+ nb0, nb1, nb2, nb3,
+ nbd0, nbd1, nbd2, nbd3,
+ item_ct1,
+ func
+ );
+ });
+ });
+}
+
static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
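+// With the generic kernel in place, each unary op reduces to a one-line lambda.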
static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, 256);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
- sycl::range<1>(256)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_sgn(x);
+ });
}
+
static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, 256);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
- sycl::range<1>(256)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_abs(x);
+ });
}
static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, 256);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
- sycl::range<1>(256)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_elu(x);
+ });
}
-
static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
- sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_silu(x);
+ });
}
static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
- sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_gelu(x);
+ });
}
-static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
- sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_gelu_quick(x);
+ });
}
-static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
- sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_gelu_erf(x);
+ });
}
static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
- sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_tanh(x);
+ });
}
static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
- sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_relu(x);
+ });
}
static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
- sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_hardsigmoid(x);
+ });
}
static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
- sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_hardswish(x);
+ });
}
static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
- sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_exp(x);
+ });
}
static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
}
static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
- sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_neg(x);
+ });
}
+
static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
- sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_step(x);
+ });
}
static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
- [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
- const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
- stream->parallel_for(
- sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
- sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
- [=](sycl::nd_item<1> item_ct1) {
- unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
- });
- });
+ ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+ return op_sigmoid(x);
+ });
}
static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {