cl_program program_mul_mm_f32_f32_l4_lm;
cl_program program_mul_mm_f16_f32_l4_lm;
- cl_kernel kernel_add, kernel_add_row;
- cl_kernel kernel_mul, kernel_mul_row;
- cl_kernel kernel_div, kernel_div_row;
- cl_kernel kernel_sub, kernel_sub_row;
+ cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
+ cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
+ cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
+ cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
cl_kernel kernel_scale;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
backend_ctx->program_add =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
- CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
GGML_LOG_CONT(".");
}
backend_ctx->program_mul =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
- CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
GGML_LOG_CONT(".");
}
#else
const std::string kernel_src = read_file("div.cl");
#endif
+ std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable -cl-finite-math-only ";
+
backend_ctx->program_div =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
- CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
GGML_LOG_CONT(".");
}
backend_ctx->program_sub =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
- CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
- CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
GGML_LOG_CONT(".");
}
default:
return false;
}
- case GGML_OP_ADD:
case GGML_OP_SCALE:
+ return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
+ case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SUB:
- return op->src[0]->type == GGML_TYPE_F32;
+ return (op->src[0]->type == op->src[1]->type) &&
+ (op->src[0]->type == op->type) &&
+ (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
- const int ne00 = src0 ? src0->ne[0] : 0;
- const int ne01 = src0 ? src0->ne[1] : 0;
- const int ne02 = src0 ? src0->ne[2] : 0;
- const int ne03 = src0 ? src0->ne[3] : 0;
+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
- const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
- const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
- const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
- const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
- const int ne10 = src1 ? src1->ne[0] : 0;
- const int ne11 = src1 ? src1->ne[1] : 0;
- const int ne12 = src1 ? src1->ne[2] : 0;
- const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
- const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
- const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
- const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
- const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+ const int ne10 = src1->ne[0];
+ const int ne11 = src1->ne[1];
+ const int ne12 = src1->ne[2];
+ const int ne13 = src1->ne[3]; UNUSED(ne13);
- const int ne0 = dst ? dst->ne[0] : 0;
- const int ne1 = dst ? dst->ne[1] : 0;
- const int ne2 = dst ? dst->ne[2] : 0;
- const int ne3 = dst ? dst->ne[3] : 0;
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
- const cl_ulong nb0 = dst ? dst->nb[0] : 0;
- const cl_ulong nb1 = dst ? dst->nb[1] : 0;
- const cl_ulong nb2 = dst ? dst->nb[2] : 0;
- const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];
+
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_add_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_add_row;
+ } else {
+ kernel = backend_ctx->kernel_add_row_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_add;
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_add;
+ } else {
+ kernel = backend_ctx->kernel_add_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
- const int ne00 = src0 ? src0->ne[0] : 0;
- const int ne01 = src0 ? src0->ne[1] : 0;
- const int ne02 = src0 ? src0->ne[2] : 0;
- const int ne03 = src0 ? src0->ne[3] : 0;
+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
- const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
- const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
- const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
- const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
- const int ne10 = src1 ? src1->ne[0] : 0;
- const int ne11 = src1 ? src1->ne[1] : 0;
- const int ne12 = src1 ? src1->ne[2] : 0;
- const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
- const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
- const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
- const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
- const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+ const int ne10 = src1->ne[0];
+ const int ne11 = src1->ne[1];
+ const int ne12 = src1->ne[2];
+ const int ne13 = src1->ne[3]; UNUSED(ne13);
+
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
- const int ne0 = dst ? dst->ne[0] : 0;
- const int ne1 = dst ? dst->ne[1] : 0;
- const int ne2 = dst ? dst->ne[2] : 0;
- const int ne3 = dst ? dst->ne[3] : 0;
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];
- const cl_ulong nb0 = dst ? dst->nb[0] : 0;
- const cl_ulong nb1 = dst ? dst->nb[1] : 0;
- const cl_ulong nb2 = dst ? dst->nb[2] : 0;
- const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_mul_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_mul_row;
+ } else {
+ kernel = backend_ctx->kernel_mul_row_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_mul;
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_mul;
+ } else {
+ kernel = backend_ctx->kernel_mul_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_div_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_div_row;
+ } else {
+ kernel = backend_ctx->kernel_div_row_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_div;
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_div;
+ } else {
+ kernel = backend_ctx->kernel_div_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_sub_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_sub_row;
+ } else {
+ kernel = backend_ctx->kernel_sub_row_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_sub;
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_sub;
+ } else {
+ kernel = backend_ctx->kernel_sub_f16;
+ }
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));