#include "common.hpp"
+#include "ggml-sycl/presets.hpp"
#include "ggml.h"
#include "element_wise.hpp"
+#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
+ for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))
+
+#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \
+ (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX))
+
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
const int ne10, const int ne11, const int ne12,
- const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
+ const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) {
+ const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0);
if (i >= ne) {
return;
}
}
}
+/* Unary OP funcs */
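+// Scalar device functors (op_*) shared by the unary element-wise kernels and
+// by the fused gated (GLU) kernels further below.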
template<typename T>
-static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
- for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
- dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
- }
+static __dpct_inline__ T op_sgn(T x) {
+ return x > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
}
template<typename T>
-static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
- for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
- dst[i] = sycl::fabs(x[i]);
- }
+static __dpct_inline__ T op_abs(T x) {
+ return sycl::fabs(x);
}
template<typename T>
-static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
- for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
- dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
- }
+static __dpct_inline__ T op_elu(T x) {
+ return (x > static_cast<T>(0.f)) ? x : sycl::expm1(x);
}
template<typename T>
-static void gelu(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
+static __dpct_inline__ T op_gelu(T x) {
const T GELU_COEF_A = static_cast<T>(0.044715f);
const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
-
- float xi = x[i];
- dst[i] = static_cast<T>(0.5f) * xi *
- (static_cast<T>(1.0f) +
- sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast<T>(1.0f) + GELU_COEF_A * xi * xi)));
+ return static_cast<T>(0.5f) * x *
+ (static_cast<T>(1.0f) +
+ sycl::tanh(SQRT_2_OVER_PI * x * (static_cast<T>(1.0f) + GELU_COEF_A * x * x)));
}
template<typename T>
-static void silu(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
+static __dpct_inline__ T op_silu(T x) {
+ return x / (static_cast<T>(1.0f) + sycl::native::exp(-x));
}
template<typename T>
-static void gelu_quick(const T *x, T *dst, int k,
- const sycl::nd_item<3> &item_ct1) {
- const float GELU_QUICK_COEF = -1.702f;
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
- if (i >= k) {
- return;
- }
- dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+static __dpct_inline__ T op_gelu_quick(T x) {
+ const T GELU_QUICK_COEF_LOCAL = static_cast<T>(-1.702f);
+ return x * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x)));
}
template<typename T>
-static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+static __dpct_inline__ T op_gelu_erf(T x) {
const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
- for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
- auto x_i = x[i];
- dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
- }
+ return static_cast<T>(0.5f) * x * (static_cast<T>(1.0f) + sycl::erf(x * SQRT_2_INV));
}
template<typename T>
-static void tanh(const T *x, T *dst, int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
- if (i >= k) {
- return;
- }
- dst[i] = sycl::tanh((x[i]));
+static __dpct_inline__ T op_tanh(T x) {
+ return sycl::tanh(x);
}
template<typename T>
-static void relu(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
+static __dpct_inline__ T op_relu(T x) {
+ return sycl::fmax(x, static_cast<T>(0));
}
template<typename T>
-static void sigmoid(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = 1.0f / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
+static __dpct_inline__ T op_sigmoid(T x) {
+ return static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x));
}
template<typename T>
-static void sqrt(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = sycl::sqrt(x[i]);
+static __dpct_inline__ T op_sqrt(T x) {
+ return sycl::sqrt(x);
}
template<typename T>
-static void sin(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = sycl::sin(x[i]);
+static __dpct_inline__ T op_sin(T x) {
+ return sycl::sin(x);
}
template<typename T>
-static void cos(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = sycl::cos(x[i]);
+static __dpct_inline__ T op_cos(T x) {
+ return sycl::cos(x);
}
template<typename T>
-static void hardsigmoid(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+static __dpct_inline__ T op_hardsigmoid(T x) {
+ return sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
}
template<typename T>
-static void hardswish(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
- }
- dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+static __dpct_inline__ T op_hardswish(T x) {
+ return x * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
}
template<typename T>
-static void exp(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
+static __dpct_inline__ T op_exp(T x) {
+ return sycl::exp(x);
+}
- if (i >= k) {
- return;
+template<typename T>
+static __dpct_inline__ T op_log(T x) {
+ if (x <= static_cast<T>(0)) {
+ return neg_infinity<T>();
}
- dst[i] = sycl::exp(x[i]);
+ return sycl::log(x);
}
template<typename T>
-static void log(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
+static __dpct_inline__ T op_neg(T x) {
+ return -x;
+}
- if (i >= k) {
- return;
- }
- T xi = x[i];
- if (xi <= 0) {
- dst[i] = neg_infinity<T>();
- } else {
- dst[i] = sycl::log(xi);
- }
+template<typename T>
+static __dpct_inline__ T op_step(T x) {
+ return (x > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
}
template<typename T>
-static void neg(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
+static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) {
+ T neg_slope_T = static_cast<T>(negative_slope);
+ return sycl::fmax(x, static_cast<T>(0)) +
+ sycl::fmin(x, static_cast<T>(0.0f)) * neg_slope_T;
+}
- if (i >= k) {
- return;
- }
- dst[i] = -x[i];
+template<typename T>
+static __dpct_inline__ T op_sqr(T x) {
+ return x * x;
}
template<typename T>
-static void step(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
+static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) {
+ return x < static_cast<T>(min_val) ? static_cast<T>(min_val) : (x > static_cast<T>(max_val) ? static_cast<T>(max_val) : x);
+}
- if (i >= k) {
- return;
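+// unary_op_*_kernel wrappers: walk the input with SYCL_GLOBAL_ID_LOOP and
+// apply the matching op_* functor to each element.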
+template<typename T>
+static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_sgn(x[i]);
}
- dst[i] = x[i] > static_cast<T>(0.0f);
}
template<typename T>
-static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
- if (i >= k) {
- return;
+static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_abs(x[i]);
}
- dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
- sycl::fmin((x[i]), static_cast<T>(0.0f)) * negative_slope;
}
template<typename T>
-static void sqr(const T * x, T * dst, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
+static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_elu(x[i]);
}
- dst[i] = x[i] * x[i];
}
-template<typename T>
-static void upscale(const T *x, T *dst, const int nb00, const int nb01,
- const int nb02, const int nb03, const int ne10, const int ne11,
- const int ne12, const int ne13, const float sf0, const float sf1,
- const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
- int index = item_ct1.get_local_id(0) +
- item_ct1.get_group(0) * item_ct1.get_local_range(0);
- if (index >= ne10 * ne11 * ne12 * ne13) {
- return;
+template<typename T>
+static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_gelu(x[i]);
}
- // operation
- int i10 = index % ne10;
- int i11 = (index / ne10) % ne11;
- int i12 = (index / (ne10 * ne11)) % ne12;
- int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
-
- int i00 = i10 / sf0;
- int i01 = i11 / sf1;
- int i02 = i12 / sf2;
- int i03 = i13 / sf3;
-
- dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}
-template <typename T>
-static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02,
- const sycl::nd_item<3> &item_ct1) {
- int nidx = item_ct1.get_local_id(2) +
- item_ct1.get_group(2) * item_ct1.get_local_range(2);
- if (nidx >= ne0) {
- return;
+template<typename T>
+static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_silu(x[i]);
}
+}
- // operation
- int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
- item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
- if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
- int offset_src = nidx + item_ct1.get_group(1) * ne00 +
- item_ct1.get_group(0) * ne00 * ne01;
- dst[offset_dst] = x[offset_src];
- } else {
- dst[offset_dst] = static_cast<T>(0.0f);
+template<typename T>
+static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_gelu_quick(x[i]);
}
}
-
template<typename T>
-static void clamp(const T * x, T * dst, const float min, const float max, const int k,
- const sycl::nd_item<3> &item_ct1) {
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (i >= k) {
- return;
+static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_gelu_erf(x[i]);
}
-
- dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
}
-static void acc_f32_sycl(const float *x, const float *y, float *dst,
- const int n_elements, const int ne10, const int ne11,
- const int ne12, const int nb1, const int nb2,
- const int offset, queue_ptr stream) {
- int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) {
- acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1);
- });
+template<typename T>
+static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_tanh(x[i]);
+ }
}
template<typename T>
-static void gelu_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); });
+static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_relu(x[i]);
+ }
}
template<typename T>
-static void silu_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); });
+static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_sigmoid(x[i]);
+ }
}
template<typename T>
-static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
- // hard code for now
- const int num_blocks = ceil_div(k, 256);
- sycl_parallel_for(
- stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)),
- [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); });
+static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_sqrt(x[i]);
+ }
}
template<typename T>
-static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
- // hard code for now
- const int num_blocks = ceil_div(k, 256);
- sycl_parallel_for(
- stream,
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)),
- [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); });
+static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_sin(x[i]);
+ }
}
-
template<typename T>
-static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
- // hard code for now
- const int num_blocks = ceil_div(k, 256);
- sycl_parallel_for(
- stream,
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)),
- [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); });
+static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_cos(x[i]);
+ }
}
template<typename T>
-static void gelu_quick_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); });
+static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_hardsigmoid(x[i]);
+ }
}
-
template<typename T>
-static void gelu_erf_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); });
+static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_hardswish(x[i]);
+ }
}
template<typename T>
-static void tanh_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); });
+static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_exp(x[i]);
+ }
}
template<typename T>
-static void relu_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); });
+static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_log(x[i]);
+ }
}
template<typename T>
-static void hardsigmoid_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
- sycl_parallel_for(
- stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); });
+static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_neg(x[i]);
+ }
}
template<typename T>
-static void hardswish_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
- sycl_parallel_for(
- stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); });
+static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_step(x[i]);
+ }
}
template<typename T>
-static void exp_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); });
+static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_leaky_relu(x[i], negative_slope);
+ }
}
template<typename T>
-static void log_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { log(x, dst, k, item_ct1); });
+static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_sqr(x[i]);
+ }
}
template<typename T>
-static void neg_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); });
+static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = op_clamp(x[i], min_val, max_val);
+ }
}
-template<typename T>
-static void step_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); });
+template<typename T>
+static void upscale(const T *x, T *dst, const int nb00, const int nb01,
+ const int nb02, const int nb03, const int ne10, const int ne11,
+ const int ne12, const int ne13, const float sf0, const float sf1,
+ const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+ int index = item_ct1.get_local_id(0) +
+ item_ct1.get_group(0) * item_ct1.get_local_range(0);
+ if (index >= ne10 * ne11 * ne12 * ne13) {
+ return;
+ }
+ // operation
+ int i10 = index % ne10;
+ int i11 = (index / ne10) % ne11;
+ int i12 = (index / (ne10 * ne11)) % ne12;
+ int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+ int i00 = static_cast<int>(i10 / sf0);
+ int i01 = static_cast<int>(i11 / sf1);
+ int i02 = static_cast<int>(i12 / sf2);
+ int i03 = static_cast<int>(i13 / sf3);
+
+ dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}
-template<typename T>
-static void sigmoid_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
- sycl_parallel_for(
- stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); });
+template <typename T>
+static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+ const sycl::nd_item<3> &item_ct1) {
+ int nidx = SYCL_LOCAL_ID_CALC(item_ct1, 2);
+ if (nidx >= ne0) {
+ return;
+ }
+
+ // operation
+ int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+ item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+ if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
+ int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+ item_ct1.get_group(0) * ne00 * ne01;
+ dst[offset_dst] = x[offset_src];
+ } else {
+ dst[offset_dst] = static_cast<T>(0.0f);
+ }
}
template<typename T>
-static void sqrt_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); });
+static void clamp(const T * x, T * dst, const float min, const float max, const int k,
+ const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
+ }
}
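+// Fused GLU kernels: for output element i, j0 indexes the value row and j1 the
+// gate row (j1 == j0 when the row offsets o0 and o1 match); the result is
+// activation(x[j0]) * g[j1].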
template<typename T>
-static void sin_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); });
+static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ const int64_t j0 = (i / n) * o0 + (i % n);
+ const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+ dst[i] = op_gelu(x[j0]) * g[j1];
+ }
}
template<typename T>
-static void cos_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); });
+static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ const int64_t j0 = (i / n) * o0 + (i % n);
+ const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+ dst[i] = op_relu(x[j0]) * g[j1];
+ }
}
template<typename T>
-static void leaky_relu_sycl(const T *x, T *dst, const int k,
- const float negative_slope,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); });
+static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
+ SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+ const int64_t j0 = (i / n) * o0 + (i % n);
+ const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+ dst[i] = op_silu(x[j0]) * g[j1];
+ }
}
-template<typename T>
-static void sqr_sycl(const T *x, T *dst, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
+namespace ggml_sycl_detail {
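+// Launch helpers and type-dispatch templates shared by the public
+// ggml_sycl_op_* entry points defined after this namespace.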
+static void acc_f32_sycl(const float *x, const float *y, float *dst,
+ const int n_elements, const int ne10, const int ne11,
+ const int ne12, const int nb1, const int nb2,
+ const int offset, queue_ptr stream) {
+ int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE);
sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); });
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) *
+ sycl::range<1>(SYCL_ACC_BLOCK_SIZE),
+ sycl::range<1>(SYCL_ACC_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+ item_ct1);
+ });
}
template<typename T>
const int ne12, const int ne13, const float sf0, const float sf1,
const float sf2, const float sf3, queue_ptr stream) {
int dst_size = ne10 * ne11 * ne12 * ne13;
- int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+ int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE);
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
sycl_parallel_for<1>(
stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
static void pad_sycl(const T *x, T *dst, const int ne00,
const int ne01, const int ne02, const int ne0,
const int ne1, const int ne2, queue_ptr stream) {
- int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
+ int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE);
sycl::range<3> gridDim(ne2, ne1, num_blocks);
sycl_parallel_for(stream,
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
[=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
}
-template<typename T>
-static void clamp_sycl(const T *x, T *dst, const float min,
- const float max, const int k,
- queue_ptr stream) {
- const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
- sycl_parallel_for(stream,
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); });
-}
-
-inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
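+// Validates src/dst types (F32, plus F16 when GGML_SYCL_F16 is enabled), casts
+// the tensor data to the matching element type and forwards it, together with
+// the element count and stream, to the supplied kernel_invoker lambda.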
+template<typename KernelInvoker, typename... Args>
+static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-
#else
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
case GGML_TYPE_F16:
{
auto data_pts = cast_data<sycl::half>(dst);
- sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward<Args>(args)...);
break;
}
#endif
case GGML_TYPE_F32:
{
auto data_pts = cast_data<float>(dst);
- sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward<Args>(args)...);
break;
}
default:
}
}
-inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
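+// Same dispatch pattern for gated (GLU) ops: the gate comes from src1 when
+// present, otherwise from the second half of src0 (the `swapped` op_param
+// selects which half); row strides are converted to element counts before
+// being passed to the kernel_invoker.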
+template<typename KernelInvoker, typename... Args>
+static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-
#else
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(dst->src[0]->type == dst->type);
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ GGML_ASSERT(dst->ne[0] == nc);
+ GGML_ASSERT(ggml_is_contiguous_1(dst->src[0]));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+ const int32_t swapped = ((const int32_t *) dst->op_params)[1];
+ void * src0_d = src0->data;
+ void * src1_d = src1 ? src1->data : src0->data;
+ const int64_t src0_o = src0->nb[1];
+ const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+ void * dst_d = dst->data;
+ if (src1) {
+ GGML_ASSERT(ggml_is_contiguous_1(src1));
+ GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
+ GGML_ASSERT(src1->ne[0] == nc);
+ GGML_ASSERT(src0->type == src1->type);
}
-}
-
-
-inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
switch (dst->type) {
#if defined (GGML_SYCL_F16)
case GGML_TYPE_F16:
{
- auto data_pts = cast_data<sycl::half>(dst);
- elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ sycl::half * src0_p = (sycl::half *) src0_d;
+ sycl::half * src1_p = (sycl::half *) src1_d;
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+ kernel_invoker(src0_p,
+ src1_p,
+ (sycl::half *) dst_d,
+ ggml_nelements(dst),
+ nc,
+ src0_o / sizeof(sycl::half),
+ src1_o / sizeof(sycl::half),
+ main_stream,
+ std::forward<Args>(args)...);
break;
}
#endif
case GGML_TYPE_F32:
{
- auto data_pts = cast_data<float>(dst);
- elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ float * src0_p = (float *) src0_d;
+ float * src1_p = (float *) src1_d;
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ kernel_invoker(src0_p,
+ src1_p,
+ (float *) dst_d,
+ ggml_nelements(dst),
+ nc,
+ src0_o / sizeof(float),
+ src1_o / sizeof(float),
+ main_stream,
+ std::forward<Args>(args)...);
break;
}
default:
}
}
-inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
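+// Dispatch for upscale: computes per-dimension scale factors (dst->ne / src->ne)
+// and forwards the source byte strides and destination extents to the
+// kernel_invoker.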
+template<typename KernelInvoker, typename... Args>
+static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
-}
-inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+ const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
+ const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
+ const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
+ const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
switch (dst->type) {
#if defined (GGML_SYCL_F16)
case GGML_TYPE_F16:
{
auto data_pts = cast_data<sycl::half>(dst);
- gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2],
+ (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3,
+ main_stream, std::forward<Args>(args)...);
break;
}
#endif
case GGML_TYPE_F32:
{
auto data_pts = cast_data<float>(dst);
- gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2],
+ (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3,
+ main_stream, std::forward<Args>(args)...);
break;
}
default:
}
}
-inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
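+// Dispatch for pad: 3-D tensors only (asserts ne[3] == 1) and forwards source
+// and destination extents to the kernel_invoker.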
+template<typename KernelInvoker, typename... Args>
+static inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
GGML_ASSERT(dst->src[0]->type == dst->type);
+ GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
switch (dst->type) {
case GGML_TYPE_F16:
{
auto data_pts = cast_data<sycl::half>(dst);
- gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->ne[0], (int)dst->src[0]->ne[1], (int)dst->src[0]->ne[2], (int)dst->ne[0],
+ (int)dst->ne[1], (int)dst->ne[2], main_stream, std::forward<Args>(args)...);
break;
}
#endif
case GGML_TYPE_F32:
{
auto data_pts = cast_data<float>(dst);
- gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+ kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->ne[0], (int)dst->src[0]->ne[1], (int)dst->src[0]->ne[2], (int)dst->ne[0],
+ (int)dst->ne[1], (int)dst->ne[2], main_stream, std::forward<Args>(args)...);
break;
}
default:
}
}
-inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
-}
-
-
-inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
-}
-
-inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
-}
+} // namespace ggml_sycl_detail
-inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
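+// Public entry points: each wraps dispatch_ggml_sycl_op_unary with a lambda
+// that launches its kernel. sgn/abs/elu keep the hard-coded 256 work-group
+// size of the previous implementation.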
+static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, 256);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+ sycl::range<1>(256)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, 256);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+ sycl::range<1>(256)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, 256);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+ sycl::range<1>(256)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
+ sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
+ sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
-
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
+ sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-
-inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+
+static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
+ sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
+ sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
+ sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
+ sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
+static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
+ sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
+}
- GGML_ASSERT(dst->src[0]->type == dst->type);
- float negative_slope;
- memcpy(&negative_slope, dst->op_params, sizeof(float));
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
+ sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- #if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
+ sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
+static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
+ sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
+}
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
+ sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
+}
- const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
- const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
- const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
- const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
- dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
- main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
- dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
- main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
+ sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined (GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- switch (dst->type) {
-#if defined (GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
- dst->ne[1], dst->ne[2], main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
- dst->ne[1], dst->ne[2], main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
+ sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
}
-inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-#if defined(GGML_SYCL_F16)
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
+static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
+ sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
+}
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
- GGML_ASSERT(dst->src[0]->type == dst->type);
- dpct::queue_ptr main_stream = ctx.stream();
- SYCL_CHECK(ggml_sycl_set_device(ctx.device));
- float min;
- float max;
- memcpy(&min, dst->op_params, sizeof(float));
- memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
+ sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
+}
- switch (dst->type) {
-#if defined(GGML_SYCL_F16)
- case GGML_TYPE_F16:
- {
- auto data_pts = cast_data<sycl::half>(dst);
- clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
-#endif
- case GGML_TYPE_F32:
- {
- auto data_pts = cast_data<float>(dst);
- clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
- break;
- }
- default:
- GGML_ABORT("GGML tensor type not supported!\n");
- }
+static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
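+    // the negative slope is stored as the first float in dst->op_params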
+ float negative_slope;
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) {
+ const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
+ sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1);
+ });
+ }, negative_slope);
+}
+
+static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+ const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
+ sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1);
+ });
+ });
+}
+
+static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03,
+ int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3,
+ queue_ptr stream) {
+ ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream);
+ });
}
-inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_pad(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2,
+ queue_ptr stream) {
+ ggml_sycl_detail::pad_sycl(src, dst_ptr, ne00, ne01, ne02, ne0, ne1, ne2, stream);
+ });
+}
+static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
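+    // clamp bounds are packed as two consecutive floats in dst->op_params: min, then max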
+ float min_val;
+ float max_val;
+ memcpy(&min_val, dst->op_params, sizeof(float));
+ memcpy(&max_val, (float *) dst->op_params + 1, sizeof(float));
+ ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+ [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) {
+ const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE);
+ sycl_parallel_for(stream,
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE),
+ sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1);
+ });
+ }, min_val, max_val);
+}
+
+static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
int offset = dst->op_params[3] / 4; // offset in bytes
- acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
+ ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
+}
+
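+// Fused GLU wrappers (GEGLU / REGLU / SWIGLU): each lambda only launches its kernel;
+// the x/g/dst pointers and the offsets o0/o1 are resolved by
+// ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu.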
+static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
+ [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
+            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_GELU_BLOCK_SIZE);
+ sycl_parallel_for(main_stream,
+ sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
+ gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
+ });
+ });
+}
+
+static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
+ [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
+ const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
+ sycl_parallel_for(main_stream,
+ sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
+ gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
+ });
+ });
+}
+
+static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
+ [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
+ const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
+ sycl_parallel_for(main_stream,
+ sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
+ gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
+ });
+ });
}
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_elu(ctx, dst);
}
+
+void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+ ggml_sycl_op_geglu(ctx, dst);
+}
+
+void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+ ggml_sycl_op_reglu(ctx, dst);
+}
+
+void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+ ggml_sycl_op_swiglu(ctx, dst);
+}