From: shani-f
Date: Sat, 15 Nov 2025 23:52:42 +0000 (+0200)
Subject: sycl : unify unary kernels with a generic implementation and enable wide operator support (llama/17213)
X-Git-Tag: upstream/0.9.4.395~170
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=2d58a1baaeae1446ad875ae38ce6e6fd618369fd;p=pkg%2Fggml%2Fsources%2Fggml

sycl : unify unary kernels with a generic implementation and enable wide operator support (llama/17213)

* SYCL: add generic unary op implementation for multiple ops (ABS/SGN/…); unify non-contiguous access

* SYCL: update documentation and sycl.csv to reflect new unary op support

* update ops.md after syncing SYCL.csv changes

* Fix SYCL.csv merge conflict

* Update ops.md after fixing SYCL.csv conflicts

* Fix SYCL.csv tail after merge conflict and regenerate ops.md

* Fix line endings and final newline in SYCL.csv

* Remove TOPK_MOE entries from SYCL.csv as requested

* Update ops.md after removing TOPK_MOE from SYCL.csv

* Regenerated SYCL.csv and synced ops.md with upstream

* Update ops.md using create_ops_docs.py
---
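A note on the unified pattern before the diff: it rests on two ideas. First, the operator is passed into a single templated kernel as a callable, so one generic loop replaces the per-op kernels. Second, the flat work-item index is decomposed into 4D coordinates and applied through per-dimension byte strides, which is what lets the same kernel serve non-contiguous (transposed or sliced) views. Below is a minimal plain-C++ sketch of that addressing scheme; apply_unary and its parameter names are hypothetical illustrations, not code from the patch, and the sketch has no SYCL dependency.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// One generic elementwise loop: the op is a callable, and source/destination
// are addressed through per-dimension byte strides (nb/nbd), mirroring how
// unary_op_generic_kernel walks ggml's ne/nb tensor layout. Hypothetical
// sketch for illustration only.
template <typename T, typename F>
void apply_unary(const T * src, T * dst, int64_t k,
                 const int64_t ne[4],   // elements per dimension
                 const size_t  nb[4],   // source strides in bytes
                 const size_t  nbd[4],  // destination strides in bytes
                 F func) {
    for (int64_t i = 0; i < k; ++i) {
        // decompose the flat index into 4D coordinates
        const int64_t i0 = i % ne[0];
        const int64_t i1 = (i / ne[0]) % ne[1];
        const int64_t i2 = (i / (ne[0]*ne[1])) % ne[2];
        const int64_t i3 = i / (ne[0]*ne[1]*ne[2]);

        // byte-stride addressing stays valid for non-contiguous views
        const T * sp = (const T *)((const char *) src + i0*nb[0]  + i1*nb[1]  + i2*nb[2]  + i3*nb[3]);
        T *       dp = (T *)((char *) dst + i0*nbd[0] + i1*nbd[1] + i2*nbd[2] + i3*nbd[3]);

        *dp = func(*sp);
    }
}

int main() {
    // a contiguous 3x2 tensor (row-major), trailing dims of size 1
    float src[6] = { -2.f, -1.f, 0.f, 1.f, 2.f, 3.f };
    float dst[6] = {};
    const int64_t ne[4] = { 3, 2, 1, 1 };
    const size_t  nb[4] = { sizeof(float), 3*sizeof(float), 6*sizeof(float), 6*sizeof(float) };

    // the op is just a lambda, so RELU, SGN, ABS, ... all share one loop
    apply_unary(src, dst, 6, ne, nb, nb, [](float x) { return x > 0.f ? x : 0.f; });

    for (float v : dst) printf("%g ", v);  // prints: 0 0 0 1 2 3
    printf("\n");
}

In the patch itself the same loop body runs inside a SYCL nd_range kernel, and the shape and stride arguments are taken from the ggml tensors' ne and nb fields, as the element_wise.cpp hunks below show.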
diff --git a/src/ggml-sycl/element_wise.cpp b/src/ggml-sycl/element_wise.cpp
index 810995d0..7d54ce60 100644
--- a/src/ggml-sycl/element_wise.cpp
+++ b/src/ggml-sycl/element_wise.cpp
@@ -170,73 +170,31 @@ static __dpct_inline__ T op_trunc(T x) {
     return sycl::trunc(x);
 }
 
-template <typename T>
-static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_sgn(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_abs(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_elu(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_gelu(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_silu(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_gelu_quick(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+template <typename T, typename F>
+static void unary_op_generic_kernel(
+    const T * x,
+    T * dst,
+    const int k,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
+    const size_t nb0, const size_t nb1, const size_t nb2, const size_t nb3,
+    const size_t nbd0, const size_t nbd1, const size_t nbd2, const size_t nbd3,
+    const sycl::nd_item<1> & item_ct1,
+    F func) {
+
+    (void) ne3;
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_gelu_erf(x[i]);
-    }
-}
+        const int64_t i0 = i % ne0;
+        const int64_t i1 = (i / ne0) % ne1;
+        const int64_t i2 = (i / (ne0*ne1)) % ne2;
+        const int64_t i3 = i / (ne0*ne1*ne2);
 
-template <typename T>
-static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_tanh(x[i]);
-    }
-}
+        const char * src_base = (const char *) x;
+        char * dst_base = (char *) dst;
 
-template <typename T>
-static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_relu(x[i]);
-    }
-}
+        const T * srcp = (const T *)(src_base + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3 );
+        T * dstp = (T *)(dst_base + i0*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3);
 
-template <typename T>
-static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_sigmoid(x[i]);
+        *dstp = func(*srcp);
     }
 }
 
@@ -261,27 +219,6 @@ static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::n
     }
 }
 
-template <typename T>
-static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_hardsigmoid(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_hardswish(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_exp(x[i]);
-    }
-}
-
 template <typename T>
 static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
@@ -289,19 +226,6 @@ static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::n
     }
 }
 
-template <typename T>
-static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_neg(x[i]);
-    }
-}
-
-template <typename T>
-static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = op_step(x[i]);
-    }
-}
 
 template <typename T>
 static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
@@ -620,6 +544,48 @@ static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx
     }
 }
 
+template <typename F>
+static inline void ggml_sycl_op_unary(
+    ggml_backend_sycl_context & ctx, ggml_tensor * dst, F func) {
+
+    ggml_tensor * src0 = dst->src[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
+    const size_t nb3 = src0->nb[3];
+
+    const size_t nbd0 = dst->nb[0];
+    const size_t nbd1 = dst->nb[1];
+    const size_t nbd2 = dst->nb[2];
+    const size_t nbd3 = dst->nb[3];
+
+    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+        [=](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+
+            const int num_blocks = ceil_div(k_elements, 256);
+
+            stream->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+                                  sycl::range<1>(256)),
+                [=](sycl::nd_item<1> item_ct1) {
+                    unary_op_generic_kernel(
+                        src, dst_ptr, k_elements,
+                        ne0, ne1, ne2, ne3,
+                        nb0, nb1, nb2, nb3,
+                        nbd0, nbd1, nbd2, nbd3,
+                        item_ct1,
+                        func
+                    );
+                });
+        });
+}
+
 static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -645,159 +611,75 @@ static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_ten
 
 static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, 256);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
-                                  sycl::range<1>(256)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_sgn(x);
+    });
 }
+
 static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, 256);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
-                                  sycl::range<1>(256)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_abs(x);
+    });
 }
 
 static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, 256);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
-                                  sycl::range<1>(256)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_elu(x);
+    });
 }
-
 static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_silu(x);
+    });
 }
 
 static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_gelu(x);
+    });
 }
 
-static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_gelu_quick(x);
+    });
 }
 
-static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_gelu_erf(x);
+    });
 }
 
 static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_tanh(x);
+    });
 }
 
 static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_relu(x);
+    });
 }
 
 static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_hardsigmoid(x);
+    });
 }
 
 static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_hardswish(x);
+    });
 }
 
 static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_exp(x);
+    });
 }
 
 static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -814,42 +696,22 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
     }
 }
 
 static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_neg(x);
+    });
 }
+
 static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_step(x);
+    });
 }
 
 static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
-        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
-            const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
-            stream->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
-                                  sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
-                [=](sycl::nd_item<1> item_ct1) {
-                    unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
-                });
-        });
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_sigmoid(x);
+    });
 }
 
 static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
diff --git a/src/ggml-sycl/ggml-sycl.cpp b/src/ggml-sycl/ggml-sycl.cpp
index 941fd41c..3f1bdfb9 100644
--- a/src/ggml-sycl/ggml-sycl.cpp
+++ b/src/ggml-sycl/ggml-sycl.cpp
@@ -4360,21 +4360,22 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             }
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_SGN:
+                case GGML_UNARY_OP_ABS:
                 case GGML_UNARY_OP_NEG:
                 case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_SIGMOID:
-                case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_GELU_ERF:
-                case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_EXP:
-                case GGML_UNARY_OP_SGN:
-                case GGML_UNARY_OP_ABS:
                 case GGML_UNARY_OP_ELU:
+                    return true;
                 case GGML_UNARY_OP_FLOOR:
                 case GGML_UNARY_OP_CEIL:
                 case GGML_UNARY_OP_ROUND: