GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
- const int ne00 = src0->ne[0];
- const cl_ulong nb01 = src0->nb[1];
- const cl_ulong nb02 = src0->nb[2];
- const cl_ulong nb03 = src0->nb[3];
- const int ne10 = src1->ne[0];
- const cl_ulong nb10 = src1->nb[0];
- const int ne11 = src1->ne[1];
- const int ne12 = src1->ne[2];
- const cl_ulong nb11 = src1->nb[1];
- const cl_ulong nb12 = src1->nb[2];
- const cl_ulong nb1 = dst->nb[1];
- const cl_ulong nb2 = dst->nb[2];
- const cl_ulong nb3 = dst->nb[3];
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
+ GGML_TENSOR_LOCALS(int, ne, dst, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
- size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
- size_t local_work_size[] = {64, 1, 1};
+ int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+ int nth = 1;
+ while (nth < ne00 && 2*nth <= max_workgroup_size) {
+ nth *= 2;
+ }
+
+ size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}