cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
+ cl_kernel kernel_restore_block_q4_0_noshuffle; // inverse of kernel_convert_block_q4_0_noshuffle, used when reading tensors back
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
cl_kernel kernel_transpose_32;
cl_kernel kernel_transpose_32_16;
cl_kernel kernel_transpose_16;
+ cl_kernel kernel_transpose_16_buf; // 16-bit transpose operating directly on cl_mem buffers
cl_kernel kernel_transpose_16_4x1;
cl_mem A_s_d_max; // scale buffer for transpose, allocated at the maximum needed size
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
- CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
+ CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
+ CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
GGML_LOG_CONT(".");
}
if (tensor->type == GGML_TYPE_Q4_0) {
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_kernels(backend_ctx, tensor)) {
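+ // The Adreno path stores Q4_0 weights as two separate transposed planes:
+ // de-shuffled 4-bit quants (extra->q) and fp16 scales (extra->d).
+ // Reading the tensor back means transposing both planes into temporary
+ // buffers, re-packing them into standard block_q4_0, and copying to host.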
+ cl_int err;
+ cl_kernel kernel;
+
+ cl_int M = tensor->ne[1]; // ne01
+ cl_int K = tensor->ne[0]; // ne00
+
+ GGML_ASSERT(K % 32 == 0); // q4_0 blocks span 32 elements
+ GGML_ASSERT(M % 4 == 0); // required by the transpose kernels
+
+ // 4-bit quants: half a byte per element.
+ size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+ // One fp16 scale per 32-element block.
+ size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+ GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
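+ // e.g. a 4096 x 4096 tensor has 524288 blocks: size_q = 8 MiB, size_d = 1 MiB.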
+
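+ // Temporaries holding the de-transposed quant and scale planes.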
+ cl_mem buf_trans_q;
+ cl_mem buf_trans_d;
+
+ CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ size_q, NULL, &err), err));
+ CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ size_d, NULL, &err), err));
+
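+ // Both planes are de-transposed with the same buffer-based 16-bit kernel.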
+ kernel = backend_ctx->kernel_transpose_16_buf;
+
+ // Transpose the quants back: the q plane is M x (K/4) 16-bit words
+ // (K 4-bit quants per row = K/4 halfs).
+ cl_int stride_k_q = K/4;
+ size_t local_size_q[3] = {64, 1, 1};
+ size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
+
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_size_q, local_size_q, 0, NULL, NULL));
+
+ // Transpose the scales back: one fp16 scale per 32-element block, M x (K/32) halfs.
+ cl_int stride_k_d = K/32;
+ size_t local_size_d[3] = {64, 1, 1};
+ size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
+
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_size_d, local_size_d, 0, NULL, NULL));
+
+ // Re-pack the de-transposed quants and scales into contiguous block_q4_0.
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+
+ cl_uchar mask_0F = 0x0F; // low-nibble mask
+ cl_uchar mask_F0 = 0xF0; // high-nibble mask
+
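+ // One work-item per q4_0 block.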
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {1, 1, 1};
+
+ kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
+
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, NULL));
+
+ // Read the re-packed tensor back to the host; CL_TRUE makes the read blocking.
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+
+ CL_CHECK(clReleaseMemObject(data_device));
+ CL_CHECK(clReleaseMemObject(buf_trans_q));
+ CL_CHECK(clReleaseMemObject(buf_trans_d));
+
+ return;
+ }
+#endif
+
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);