return { type, major, minor, patch };
}
+// RAII wrapper around an owned cl_mem device buffer.
+//
+// The wrapper releases the handle in its destructor and whenever allocate()
+// has to grow the buffer. Copying is deleted: two owners of the same raw
+// cl_mem would call clReleaseMemObject twice on one handle (double-free).
+struct ggml_cl_buffer {
+    cl_mem buffer;  // owned device buffer; nullptr until first allocate()
+    size_t size;    // current capacity in bytes (monotonic high-water mark)
+
+    ggml_cl_buffer()
+        : buffer(nullptr), size(0) {}
+
+    ~ggml_cl_buffer() {
+        if (buffer) {
+            CL_CHECK(clReleaseMemObject(buffer));
+        }
+    }
+
+    // Non-copyable: owning a raw handle, a copy would be released twice.
+    ggml_cl_buffer(const ggml_cl_buffer &) = delete;
+    ggml_cl_buffer & operator=(const ggml_cl_buffer &) = delete;
+
+    // Ensure the buffer can hold at least new_size bytes. Grows only
+    // (never shrinks), so repeated calls reuse the existing allocation.
+    // NOTE: existing contents are NOT preserved across a reallocation.
+    void allocate(cl_context context, size_t new_size) {
+        if (new_size <= size) {
+            return;  // current allocation already large enough
+        }
+        if (buffer) {
+            CL_CHECK(clReleaseMemObject(buffer));
+            buffer = nullptr;  // no dangling handle if creation fails below
+        }
+        cl_int err;
+        CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, new_size, NULL, &err), err));
+        // Commit the new capacity only after the buffer actually exists.
+        size = new_size;
+    }
+};
+
// Profiling
struct ProfilingInfo {
std::string op_name;
cl_context context;
cl_command_queue queue;
+ // prealloc buffers for transposing weights and activations
+ ggml_cl_buffer prealloc_quant_trans;
+ ggml_cl_buffer prealloc_scales_trans;
+ ggml_cl_buffer prealloc_act_trans;
+
cl_program program_add;
cl_program program_add_id;
cl_program program_clamp;
cl_kernel kernel_transpose_16_buf;
cl_kernel kernel_transpose_16_4x1;
- cl_mem A_s_d_max; // max scale buffer size for transpose
- cl_mem A_q_d_max; // max weight buffer size for transpose
- cl_mem B_d_max; // max activation buffer size for transpose
-
// Gemm and Gemv related programs, kernels, etc
cl_program program_CL_gemm;
cl_program program_CL_gemv_general;
required_B_d_bytes, max_B_d_bytes);
}
- CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
- CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
- CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
+ backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes);
+ backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes);
+ backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes);
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
// use sub_buffer of max buffer size instead
size_t q_size_bytes = K * M / 8 * sizeof(float);
+ backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
+
cl_buffer_region region;
region.origin = 0;
region.size = q_size_bytes;
cl_mem qT_d = clCreateSubBuffer(
- backend_ctx->A_q_d_max,
+ backend_ctx->prealloc_quant_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
®ion,
&err);
- // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
CL_CHECK(err);
bool K_tile_trans = true;
if ((K / 32) % 4 != 0){
K_tile_trans =false;
}
+
size_t d_size_bytes = M * (K / 32) * 2;
+ backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
+
region.origin = 0;
region.size = d_size_bytes;
cl_mem dT_d = clCreateSubBuffer(
- backend_ctx->A_s_d_max,
+ backend_ctx->prealloc_scales_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
®ion,
&err);
- // cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
CL_CHECK(err);
// <----------------------------------------------------------------------------------> //
region.origin = 0;
// Specify the size of the sub-buffer (divide by 2 for FP16)
region.size = K * (N + padding) * sizeof(float)/2;
+ backend_ctx->prealloc_act_trans.allocate(context, region.size);
+
B_d = clCreateSubBuffer(
- backend_ctx->B_d_max,
+ backend_ctx->prealloc_act_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
®ion,