bool fp16_support; // set from whether the device extension string contains "cl_khr_fp16"
bool has_vector_subgroup_broadcast;
bool disable_fusion;
+
+ bool adreno_has_large_buffer; // capability: the device extension string lists "cl_qcom_large_buffer"
+ bool adreno_use_large_buffer; // decision: opted in via GGML_OPENCL_ADRENO_USE_LARGE_BUFFER on an Adreno GPU that has the capability
ggml_cl_compiler_version adreno_cl_compiler_version;
int adreno_wave_size;
" -cl-mad-enable -cl-unsafe-math-optimizations"
" -cl-finite-math-only -cl-fast-relaxed-math";
+ if (backend_ctx->adreno_use_large_buffer) { // append the Qualcomm large-buffer build option only when large buffers are in use
+ compile_opts += " -qcom-enable-large-buffer ";
+ }
+
GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
// add
// Check if ext_buffer contains cl_khr_fp16
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+ // Record whether ext_buffer advertises cl_qcom_large_buffer (capability only; whether it is used is stored separately in adreno_use_large_buffer)
+ backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
// fp16 is required
if (!backend_ctx->fp16_support) {
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+ // Decide whether to use Adreno large buffers: requires the environment-variable opt-in and an Adreno GPU
+ backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr && // only the variable's presence is tested, not its value
+ backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
+ if (backend_ctx->adreno_use_large_buffer) {
+ if (!backend_ctx->adreno_has_large_buffer) { // requested, but the extension was not found in the device extension string
+ GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
+ backend_ctx->adreno_use_large_buffer = false; // clear the flag so later checks take the regular-buffer path
+ } else {
+ GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
+ }
+ }
+
cl_int err;
// A local ref of cl_context for convenience
cl_int err;
cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err); // first attempt: ordinary buffer
+ if (err != CL_SUCCESS && backend_ctx->adreno_use_large_buffer) { // retry as a large buffer only after the ordinary allocation failed
+ cl_mem_properties props[] = { 0x41A6 /* CL_LARGE_BUFFER_QCOM */, 1, 0 }; // layout: {property, value, 0 terminator}
+ mem = clCreateBufferWithProperties(backend_ctx->context, props, CL_MEM_READ_WRITE, size, NULL, &err); // overwrites mem and err with the retry's result
+ }
+
if (err != CL_SUCCESS) {
GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
return nullptr;