cl_program program_rms_norm;
cl_program program_group_norm;
cl_program program_rope;
- cl_program program_scale;
cl_program program_silu;
cl_program program_sigmoid;
cl_program program_softmax_f32;
cl_program program_softmax_4_f16;
cl_program program_argsort_f32_i32;
cl_program program_sum_rows_f32;
- cl_program program_repeat;
cl_program program_pad;
- cl_program program_tanh;
cl_program program_upscale;
- cl_program program_concat;
cl_program program_conv_2d_f16;
cl_program program_conv_2d_f32;
cl_program program_conv_2d_f16_f32;
cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
cl_kernel kernel_add_id;
- cl_kernel kernel_scale;
+ cl_kernel kernel_scale_f32, kernel_scale_f32_4;
cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
cl_kernel kernel_mean_f32;
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32;
- cl_kernel kernel_repeat;
+ cl_kernel kernel_repeat_f32;
cl_kernel kernel_pad;
- cl_kernel kernel_tanh_f32_nd;
- cl_kernel kernel_tanh_f16_nd;
+ cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
+ cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
cl_kernel kernel_expm1_f32_nd;
cl_kernel kernel_expm1_f16_nd;
cl_kernel kernel_softplus_f32_nd;
cl_kernel kernel_softplus_f16_nd;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
- cl_kernel kernel_concat_f32_contiguous;
- cl_kernel kernel_concat_f32_non_contiguous;
+ cl_kernel kernel_concat_f32;
cl_kernel kernel_conv_2d_f16;
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
#else
const std::string kernel_src = read_file("scale.cl");
#endif
- backend_ctx->program_scale =
+ cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
+ CL_CHECK((backend_ctx->kernel_scale_f32 = clCreateKernel(prog, "kernel_scale_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
+ CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
#else
const std::string kernel_src = read_file("repeat.cl");
#endif
- if (!kernel_src.empty()) {
- backend_ctx->program_repeat =
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
- GGML_LOG_CONT(".");
- } else {
- GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
- backend_ctx->program_repeat = nullptr;
- backend_ctx->kernel_repeat = nullptr;
- }
+ cl_program prog =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
+ CL_CHECK(clReleaseProgram(prog));
+ GGML_LOG_CONT(".");
}
// pad
#else
const std::string kernel_src = read_file("tanh.cl");
#endif
- if (!kernel_src.empty()) {
- backend_ctx->program_tanh =
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
- CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
- GGML_LOG_CONT(".");
- } else {
- GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
- backend_ctx->program_tanh = nullptr;
- backend_ctx->kernel_tanh_f32_nd = nullptr;
- backend_ctx->kernel_tanh_f16_nd = nullptr;
- }
+ cl_program prog =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_tanh_f32 = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_tanh_f32_4 = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
+ CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
+ CL_CHECK((backend_ctx->kernel_tanh_f16 = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_tanh_f16_4 = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
+ CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
+ CL_CHECK(clReleaseProgram(prog));
+ GGML_LOG_CONT(".");
}
// expm1
#include "concat.cl.h"
};
#else
-
const std::string kernel_src = read_file("concat.cl");
#endif
- if (!kernel_src.empty()) {
- backend_ctx->program_concat =
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-
- CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
- CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
- GGML_LOG_CONT(".");
- } else {
- GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
- backend_ctx->program_concat = nullptr;
- backend_ctx->kernel_concat_f32_contiguous = nullptr;
- backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
- }
+ cl_program prog =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
+ CL_CHECK(clReleaseProgram(prog));
+ GGML_LOG_CONT(".");
}
// timestep_embedding
case GGML_UNARY_OP_SIGMOID:
return ggml_is_contiguous(op->src[0]);
case GGML_UNARY_OP_TANH:
- return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
- (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+ return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
case GGML_UNARY_OP_EXPM1:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
- cl_ulong offset0_abs = extra0->offset + src0->view_offs;
- cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
- cl_kernel kernel;
- if (dst->type == GGML_TYPE_F32) {
- kernel = backend_ctx->kernel_tanh_f32_nd;
- } else if (dst->type == GGML_TYPE_F16) {
- kernel = backend_ctx->kernel_tanh_f16_nd;
- } else {
- GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
- }
- GGML_ASSERT(kernel != nullptr);
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
- const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
- const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
- const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
- const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+ cl_kernel kernel;
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+ if (ggml_is_contiguous(src0)) {
+ // Handle contiguous input
+ int n = ggml_nelements(dst);
+ if (n % 4 == 0) {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_tanh_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_tanh_f16_4;
+ }
+ n /= 4;
+ } else {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_tanh_f32;
+ } else {
+ kernel = backend_ctx->kernel_tanh_f16;
+ }
+ }
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
- size_t global_work_size[3];
- if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
- return;
- }
- global_work_size[0] = (size_t)ne10;
- global_work_size[1] = (size_t)ne11;
- global_work_size[2] = (size_t)ne12;
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
- size_t lws0 = 16, lws1 = 4, lws2 = 1;
- if (ne10 < 16) lws0 = ne10;
- if (ne11 < 4) lws1 = ne11;
- if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr;
+ }
- while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+ } else {
+ // Handle non-contiguous input
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_tanh_f32_nc;
+ } else {
+ kernel = backend_ctx->kernel_tanh_f16_nc;
+ }
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;
- size_t local_work_size[] = {lws0, lws1, lws2};
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
- size_t* local_work_size_ptr = local_work_size;
- if (!backend_ctx->non_uniform_workgroups) {
- if (global_work_size[0] % local_work_size[0] != 0 ||
- global_work_size[1] % local_work_size[1] != 0 ||
- global_work_size[2] % local_work_size[2] != 0) {
- local_work_size_ptr = NULL;
- }
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
- if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
- if (backend_ctx->kernel_repeat == nullptr) {
- GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
- return;
- }
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
- ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
- ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
- cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
- cl_ulong off_dst = extra_dst->offset + dst->view_offs;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
+
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
+
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];
- const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
- const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
- const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
- const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
+ cl_kernel kernel = backend_ctx->kernel_repeat_f32;
- cl_kernel kernel = backend_ctx->kernel_repeat;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
-
- size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
- size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
- size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
-
- size_t global_work_size[] = { gws0, gws1, gws2 };
+ int nth = 64;
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
+ size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
- cl_command_queue queue = backend_ctx->queue;
- if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
- GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
- return;
- }
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
- ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
- ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
- ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
- cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
- cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
- cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
- const int32_t dim = ((const int32_t *) dst->op_params)[0];
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
+
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3];
+
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];
+
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
+
+ const cl_int dim = ((const int32_t *) dst->op_params)[0];
GGML_ASSERT(dim >= 0 && dim <= 3);
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
- if (dim == 3) {
+ int nth = MIN(64, ne0);
- size_t nbytes_src0 = ggml_nbytes(src0);
- size_t nbytes_src1 = ggml_nbytes(src1);
+ cl_kernel kernel = backend_ctx->kernel_concat_f32;
- CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
- off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
- CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
- off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
- } else {
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
+
+ size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
- cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
- size_t global_work_size[3];
-
- for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
- cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
- cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
- cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
-
- int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
- int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
- int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
-
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
-
- global_work_size[0] = d_ne0;
- global_work_size[1] = d_ne1;
- global_work_size[2] = d_ne2;
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
- }
- }
- } else {
- cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
-
- cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
- cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
-
- cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
-
- cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
- cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
-
-
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
-
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &ne01));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &ne02));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &ne03));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
-
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
-
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long), &d_ne0));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long), &d_ne1));
- CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long), &d_ne2));
- CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long), &d_ne3));
- CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
- CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
- CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
- CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
- CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
-
- size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
- d_ne2 > 0 ? (size_t)d_ne2 : 1,
- d_ne3 > 0 ? (size_t)d_ne3 : 1 };
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
- }
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
CL_CHECK(clReleaseMemObject(D_sub_buffer));
CL_CHECK(clReleaseMemObject(D_image1d));
#else
+ GGML_UNUSED(backend);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
- cl_kernel kernel = backend_ctx->kernel_scale;
+ cl_kernel kernel;
+
+ int n = ggml_nelements(dst);
+
+ if (n % 4 == 0) {
+ kernel = backend_ctx->kernel_scale_f32_4;
+ n /= 4;
+ } else {
+ kernel = backend_ctx->kernel_scale_f32;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
- int n = ggml_nelements(dst)/4;
-
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};