cl_uint minor = 0;
};
+
struct ggml_cl_compiler_version {
ADRENO_CL_COMPILER_TYPE type;
int major = -1;
}
};
+static size_t align_to(size_t value, size_t to_alignment) {
+ GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+ GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+ return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
+
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
static ggml_cl_version parse_cl_version(std::string_view str) {
size_t major_str_begin = 0;
int adreno_wave_size;
+ cl_bool non_uniform_workgroups;
+
cl_context context;
cl_command_queue queue;
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+ if (opencl_c_version.major >= 3) {
+ CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+ &backend_ctx->non_uniform_workgroups, 0));
+ } else {
+ GGML_ASSERT(opencl_c_version.major == 2);
+ // Non-uniform workgroup sizes is mandatory feature in v2.x.
+ backend_ctx->non_uniform_workgroups = true;
+ }
+
// Print out configurations
#ifdef GGML_OPENCL_SOA_Q
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
// The original tensor memory is divided into scales and quants, i.e.,
// we first store scales, then quants.
// Create subbuffer for scales.
- region.origin = extra_orig->offset + tensor->view_offs + offset;
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
region.size = size_d;
extra->d = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
CL_CHECK(err);
+ auto previous_origin = region.origin;
// Create subbuffer for quants.
- region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+ region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
region.size = size_q;
extra->q = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
} else {
unsigned int nth = MIN(64, ne0);
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
} else {
unsigned int nth = MIN(64, ne0);
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
size_t local_work_size[] = {64, 1, 1};
+ size_t * local_work_size_ptr = local_work_size;
+ if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
}