kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
cl_kernel kernel_norm, kernel_norm_mul_add;
cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
+ cl_kernel kernel_l2_norm_f32;
cl_kernel kernel_group_norm, kernel_group_norm_mul_add;
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
cl_kernel kernel_diag_f32;
GGML_LOG_CONT(".");
}
+ // l2_norm
+ {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "l2_norm.cl.h"
+ };
+#else
+ const std::string kernel_src = read_file("l2_norm.cl");
+#endif
+ cl_program prog =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_l2_norm_f32 = clCreateKernel(prog, "kernel_l2_norm_f32", &err), err));
+ CL_CHECK(clReleaseProgram(prog));
+ GGML_LOG_CONT(".");
+ }
+
// rope
{
#ifdef GGML_OPENCL_EMBED_KERNELS
return true;
case GGML_OP_RMS_NORM:
return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
+ case GGML_OP_L2_NORM:
+ return ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_REPEAT:
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
case GGML_OP_PAD:
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
+static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
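+ // epsilon is stored in the op params of the dst tensor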
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+
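+ // subgroup size: 64 on Adreno, 32 on Intel; also sizes the local reduction buffer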
+ size_t sgs;
+ if (backend_ctx->gpu_family == ADRENO) {
+ sgs = 64;
+ } else if (backend_ctx->gpu_family == INTEL) {
+ sgs = 32;
+ } else {
+ GGML_ASSERT(false && "Unsupported GPU");
+ }
+
+ cl_kernel kernel = backend_ctx->kernel_l2_norm_f32;
+
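+ // one workgroup per row: grow nth in powers of two until it covers ne00 or hits the kernel's workgroup size limit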
+ int nth = sgs;
+ while (nth < ne00 && nth < (int)backend_ctx->get_kernel_workgroup_size(kernel)) {
+ nth *= 2;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
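+ // arg 12 is local scratch: one float per subgroup for the partial sums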
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
+
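+ // ne01 workgroups of nth threads along dim 0 (one per row); ne02 x ne03 cover the remaining dims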
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
}
func = ggml_cl_rms_norm;
break;
+ case GGML_OP_L2_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_l2_norm;
+ break;
case GGML_OP_GROUP_NORM:
if (!any_on_device) {
return false;
--- /dev/null
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_l2_norm_f32(
+ global void * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ float eps,
+ local float * sum
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
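+ // each workgroup normalizes one row (i01, i02, i03)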
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
+ global float * y = (global float *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+ float sumf = 0;
+
+ // parallel sum of squares across the row
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ sumf += x[i00] * x[i00];
+ }
+ sumf = sub_group_reduce_add(sumf);
+
+ if (get_sub_group_local_id() == 0) {
+ sum[get_sub_group_id()] = sumf;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // reduce the per-subgroup partial sums held in local memory
+ for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
+ if (get_local_id(0) < i) {
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
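+ // scale = 1/||x||, with eps flooring the squared sum to avoid division by zero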
+ const float scale = 1.0f/sqrt(max(sum[0], eps));
+
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ y[i00] = x[i00] * scale;
+ }
+}