mul_mv_q4_0_f32_8x_flat
mul_mv_q4_0_f32_1d_8x_flat
mul_mv_q4_0_f32_1d_16x_flat
- mul_mv_q6_k
+ mul_mv_q6_k_f32
+ mul_mv_q6_k_f32_flat
mul_mv_q8_0_f32
mul_mv_q8_0_f32_flat
mul_mv_mxfp4_f32
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
+ cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
+ cl_kernel kernel_mul_mv_q6_K_f32_flat;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
cl_kernel kernel_solve_tri_f32;
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
+ CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
+ CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
GGML_LOG_CONT(".");
}
GGML_LOG_CONT(".");
}
- // mul_mv_q6_k
+ // mul_mv_q6_k_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
- #include "mul_mv_q6_k.cl.h"
+ #include "mul_mv_q6_k_f32.cl.h"
};
#else
- const std::string kernel_src = read_file("mul_mv_q6_k.cl");
+ const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl");
#endif
backend_ctx->program_mul_mv_q6_K =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
GGML_LOG_CONT(".");
}
+ // mul_mv_q6_k_f32_flat
+ {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "mul_mv_q6_k_f32_flat.cl.h"
+ };
+#else
+ const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl");
+#endif
+ cl_program prog =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
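+        // The program is only needed to create this one kernel, so it is released right after.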
+ CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err));
+ CL_CHECK(clReleaseProgram(prog));
+ GGML_LOG_CONT(".");
+ }
+
// mul_mv_q8_0_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
}
};
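+
+// Extra device buffers for Q6_K tensors when GGML_OPENCL_SOA_Q is enabled:
+// the AoS block_q6_K data is split into four flat buffers (ql, qh, scales, d)
+// so the flat mul_mv kernel can address each component directly.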
+struct ggml_tensor_extra_cl_q6_K {
+ // Lower 4 bits of quantized weights.
+ cl_mem ql = nullptr;
+ // Upper 2 bits of quantized weights.
+ cl_mem qh = nullptr;
+    // Scales, one int8 scale per 16-element block.
+    cl_mem s = nullptr;
+    // Super-block scales (fp16), one per super block.
+    cl_mem d = nullptr;
+
+ size_t size_ql = 0;
+ size_t size_qh = 0;
+ size_t size_s = 0;
+ size_t size_d = 0;
+
+ ~ggml_tensor_extra_cl_q6_K() {
+ reset();
+ }
+
+ void reset() {
+ if (ql != nullptr) {
+ CL_CHECK(clReleaseMemObject(ql));
+ ql = nullptr;
+ }
+ if (qh != nullptr) {
+ CL_CHECK(clReleaseMemObject(qh));
+ qh = nullptr;
+ }
+ if (s != nullptr) {
+ CL_CHECK(clReleaseMemObject(s));
+ s = nullptr;
+ }
+ if (d != nullptr) {
+ CL_CHECK(clReleaseMemObject(d));
+ d = nullptr;
+ }
+
+ size_ql = 0;
+ size_qh = 0;
+ size_s = 0;
+ size_d = 0;
+ }
+};
+
//------------------------------------------------------------------------------
// Backend API
//------------------------------------------------------------------------------
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
delete e;
}
+ for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
+ delete e;
+ }
+ for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
+ delete e;
+ }
}
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
return extra;
}
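+    // Allocate a temporary Q6_K extra, reusing one from the pool when available
+    // (same pattern as the other quantized types above).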
+ ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
+ ggml_tensor_extra_cl_q6_K * extra;
+ if (temp_tensor_extras_q6_K.empty()) {
+ extra = new ggml_tensor_extra_cl_q6_K();
+ } else {
+ extra = temp_tensor_extras_q6_K.back();
+ temp_tensor_extras_q6_K.pop_back();
+ }
+
+ temp_tensor_extras_q6_K_in_use.push_back(extra);
+
+ extra->reset();
+ return extra;
+ }
+
void reset() {
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
temp_tensor_extras.push_back(e);
temp_tensor_extras_q8_0.push_back(e);
}
temp_tensor_extras_q8_0_in_use.clear();
+
+ for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
+ temp_tensor_extras_q6_K.push_back(e);
+ }
+ temp_tensor_extras_q6_K_in_use.clear();
}
// Pools for extras. Available extras are in `temp_tensor_extras`. Extras
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
+ std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
+ std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
// before any tensor is initialized (at the beginning of alloc_tensor_range).
return;
}
+ if (tensor->type == GGML_TYPE_Q6_K) {
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+ // Allocate the new extra and create aliases from the original.
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+ ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K();
+
+ size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+ size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
+ size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
+ size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+ GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) &&
+ "Incorrect tensor size");
+
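+        // Stage the original AoS block_q6_K data in a temporary device buffer;
+        // kernel_convert_block_q6_K below scatters it into the ql/qh/s/d
+        // sub-buffers, after which the staging buffer is released.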
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+ CL_CHECK(clEnqueueWriteBuffer(
+ queue, data_device, CL_TRUE, 0,
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+ cl_buffer_region region;
+
+ // Subbuffer for ql
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+ region.size = size_ql;
+ extra->ql = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ auto previous_origin = region.origin;
+
+ // Subbuffer for qh
+ region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
+ region.size = size_qh;
+ extra->qh = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ previous_origin = region.origin;
+
+ // Subbuffer for scales
+ region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
+ region.size = size_s;
+ extra->s = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ previous_origin = region.origin;
+
+ // Create subbuffer for d.
+ region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
+ region.size = size_d;
+ extra->d = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ previous_origin = region.origin;
+
+ // Flatten the weights
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
+
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clReleaseMemObject(data_device));
+
+ extra->size_ql = size_ql;
+ extra->size_qh = size_qh;
+ extra->size_s = size_s;
+ extra->size_d = size_d;
+
+ tensor->extra = extra;
+ return;
+ }
#endif // GGML_OPENCL_SOA_Q
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+ CL_CHECK(clReleaseMemObject(data_device));
+ return;
+ }
+ if (tensor->type == GGML_TYPE_Q6_K) {
+ ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
+
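+        // Reassemble block_q6_K data from the flattened ql/qh/s/d buffers into a
+        // temporary device buffer, which is then read back to the host below.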
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {1, 1, 1};
+
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, &evt));
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
+ ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
#endif
const int ne00 = src0 ? src0->ne[0] : 0;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
+#ifdef GGML_OPENCL_SOA_Q
+ kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
+
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 2;
+ ndst = 4;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 2;
+ ndst = 4;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
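+        // nth0 x nth1 is the workgroup shape: nth0 is the subgroup size and nth1 the
+        // number of subgroups per workgroup; ndst is the number of output rows each
+        // subgroup computes. These must match N_SIMDWIDTH, N_SIMDGROUP and N_DST in
+        // mul_mv_q6_k_f32_flat.cl.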
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3));
+#else
kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
if (backend_ctx->gpu_family == INTEL) {
- nth0 = 2;
- nth1 = 16;
+ nth0 = 16;
+ nth1 = 2;
+ ndst = 1;
} else if (backend_ctx->gpu_family == ADRENO) {
- nth0 = 2;
- nth1 = 64;
+ nth0 = 64;
+ nth1 = 2;
+ ndst = 1;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
break;
case GGML_TYPE_MXFP4: {
#ifdef GGML_OPENCL_SOA_Q
} else if (src0t == GGML_TYPE_Q5_K) {
GGML_ASSERT(false && "not implemented");
} else if (src0t == GGML_TYPE_Q6_K) {
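+        // Each subgroup computes ndst rows and each workgroup holds nth1 subgroups,
+        // so ne01 is rounded up to ndst*nth1 rows per workgroup; dimension 0 then
+        // counts nth0 work items per workgroup.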
- size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+ size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
uint8_t qs[QK4_0 / 2];
};
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
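+// 6-bit quantization: each 256-element super block stores the low 4 bits of every
+// weight in ql, the high 2 bits packed in qh, one int8 scale per 16-element block
+// in scales, and a single fp16 super-block scale d. A weight is reconstructed as
+// d * scales[i/16] * (q - 32), where q is the 6-bit value assembled from ql and qh.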
+struct block_q6_K {
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits
+ half d; // super-block scale
+};
+
//------------------------------------------------------------------------------
// kernel_convert_block_q4_0
// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
b->qs[i] = q[i];
}
}
+
+//------------------------------------------------------------------------------
+// kernel_convert_block_q6_K
+// Convert the block_q6_K format to 4 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+// Each thread processes a super block.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q6_K(
+ global struct block_q6_K * src0,
+ global uchar * dst_ql,
+ global uchar * dst_qh,
+ global char * dst_s,
+ global half * dst_d
+) {
+ global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
+ global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
+ global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
+ global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
+ global half * d = (global half *) dst_d + get_global_id(0);
+
+ *d = b->d;
+
+ for (int i = 0; i < QK_K/2; ++i) {
+ ql[i] = b->ql[i];
+ }
+ for (int i = 0; i < QK_K/4; ++i) {
+ qh[i] = b->qh[i];
+ }
+ for (int i = 0; i < QK_K/16; ++i) {
+ s[i] = b->scales[i];
+ }
+}
+
+// Restore block_q6_K from flattened arrays.
+// Each thread processes a super block.
+kernel void kernel_restore_block_q6_K(
+ global uchar * dst_ql,
+ global uchar * dst_qh,
+ global char * dst_s,
+ global half * dst_d,
+ global struct block_q6_K * dst
+) {
+ global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
+ global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
+ global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
+ global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
+ global half * d = (global half *) dst_d + get_global_id(0);
+
+ b->d = *d;
+
+ for (int i = 0; i < QK_K/2; ++i) {
+ b->ql[i] = ql[i];
+ }
+ for (int i = 0; i < QK_K/4; ++i) {
+ b->qh[i] = qh[i];
+ }
+ for (int i = 0; i < QK_K/16; ++i) {
+ b->scales[i] = s[i];
+ }
+}
+++ /dev/null
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_subgroups
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-#define QR4_1 2
-#define QK5_0 32
-#define QR5_0 2
-#define QK5_1 32
-#define QR5_1 2
-#define QK8_0 32
-#define QR8_0 1
-#define QK_K 256
-#define K_QUANTS_PER_ITERATION 2
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-//------------------------------------------------------------------------------
-// block_q6_K
-//------------------------------------------------------------------------------
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
- half d; // super-block scale
-} block_q6_K;
-
-//------------------------------------------------------------------------------
-// kernel_mul_mv_q6_K_f32
-//------------------------------------------------------------------------------
-
-#undef N_DST
-#undef N_SIMDGROUP
-#undef N_SIMDWIDTH
-
-#ifdef INTEL_GPU
-#define N_DST 1 // number of rows each SIMD group works on
-#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 16 // SIMD group size
-#elif defined (ADRENO_GPU)
-#define N_DST 1
-#define N_SIMDGROUP 2
-#define N_SIMDWIDTH 64
-#endif
-
-#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
-
-#ifdef INTEL_GPU
-REQD_SUBGROUP_SIZE_16
-#elif defined (ADRENO_GPU)
-REQD_SUBGROUP_SIZE_64
-#endif
-kernel void kernel_mul_mv_q6_K_f32(
- global void * src0,
- ulong offset0,
- global float * src1,
- ulong offset1,
- global float * dst,
- ulong offsetd,
- int ne00,
- int ne01,
- int ne02,
- int ne10,
- int ne12,
- int ne0,
- int ne1,
- int r2,
- int r3
-) {
- src0 = (global void*)((global char*)src0 + offset0);
- src1 = (global float*)((global char*)src1 + offset1);
- dst = (global float*)((global char*)dst + offsetd);
-
- uchar kmask1 = 0x03;
- uchar kmask2 = 0x0C;
- uchar kmask3 = 0x30;
- uchar kmask4 = 0xC0;
-
- int nb = ne00/QK_K;
-
- int r0 = get_group_id(0);
- int r1 = get_group_id(1);
- int im = get_group_id(2);
-
- int row = N_SIMDGROUP * r0 + get_sub_group_id();
-
- if (row >= ne01) {
- return;
- }
-
- int i12 = im%ne12;
- int i13 = im/ne12;
-
- ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
- global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
- global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
-
- float sumf = 0;
-
- // For Q6_K quantization, 16 values forms a subblock, 16 subblock forms a
- // block. Values in a subblock shares a scale that is quantized with 8 bits;
- // the entire block shares a single floating point scale.
- // For work distribution, each thread processes a subblock (16 weights), hence
- // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
- // (super) blocks -- this is the block stride.
- // The 16 threads that process a (super) block are split into 2 portions, each has
- // 8 threads; each portion works on 8 subblocks.
- // For subgroup of 16 threads, the entire subgroup works on a single (super) block
- // before moving to the next (super) block. Thread0 - thread7 work on the
- // first 8 subblocks; thread8 - thread15 works on the last 8 subblocks.
- // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
- // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
- // works on a total of 16 weight values.
- int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
- int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
- int ip = tid/8; // first or second half of (super) block (0 or 1)
- int il = tid%8; // each half has 8 parts, one per scale
- int n = 4; // 4 scales at a time (and 4 sums)
- int l0 = n*il; // offset into half-block, 0..28
- int is = 8*ip + l0/16; // 0, 1, 8, 9
-
- int y_offset = 128*ip + l0;
- int q_offset_l = 64*ip + l0;
- int q_offset_h = 32*ip + l0;
-
- for (int i = ix; i < nb; i += BLOCK_STRIDE) {
-
- global uint8_t * q1 = x[i].ql + q_offset_l;
- global uint8_t * q2 = q1 + QK_K/8;
- global uint8_t * qh = x[i].qh + q_offset_h;
- global int8_t * sc = x[i].scales + is;
-
- global float * y = yy + i * QK_K + y_offset;
-
- float dall = x[i].d;
-
- float4 sums = {0.f, 0.f, 0.f, 0.f};
-
- sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
- sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
- sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
- sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
-
- sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
- sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
- sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
- sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
-
- sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
- sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
- sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
- sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
-
- sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
- sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
- sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
- sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
-
- sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
- }
-
- float tot = sub_group_reduce_add(sumf);
- if (get_sub_group_local_id() == 0) {
- dst[r1*ne0 + im*ne0*ne1 + row] = tot;
- }
-}
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+#define QR4_1 2
+#define QK5_0 32
+#define QR5_0 2
+#define QK5_1 32
+#define QR5_1 2
+#define QK8_0 32
+#define QR8_0 1
+#define QK_K 256
+#define K_QUANTS_PER_ITERATION 2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits
+ half d; // super-block scale
+} block_q6_K;
+
+//------------------------------------------------------------------------------
+// kernel_mul_mv_q6_K_f32
+//------------------------------------------------------------------------------
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 1 // number of rows each SIMD group works on
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // SIMD group size
+#elif defined (ADRENO_GPU)
+#define N_DST 1
+#define N_SIMDGROUP 2
+#define N_SIMDWIDTH 64
+#endif
+
+#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q6_K_f32(
+ global void * src0,
+ ulong offset0,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ uchar kmask1 = 0x03;
+ uchar kmask2 = 0x0C;
+ uchar kmask3 = 0x30;
+ uchar kmask4 = 0xC0;
+
+ int nb = ne00/QK_K;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ int row = N_SIMDGROUP * r0 + get_sub_group_id();
+
+ if (row >= ne01) {
+ return;
+ }
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+ global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
+ global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float sumf = 0;
+
+    // For Q6_K quantization, 16 values form a subblock and 16 subblocks form a
+    // (super) block. Values in a subblock share a scale that is quantized with 8 bits;
+    // the entire block shares a single floating point scale.
+    // For work distribution, each thread processes 16 weights, hence
+    // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
+    // (super) blocks -- this is the block stride.
+    // The 16 threads that process a (super) block are split into 2 portions of
+    // 8 threads; each portion works on 8 subblocks.
+    // For a subgroup of 16 threads, the entire subgroup works on a single (super) block
+    // before moving to the next (super) block. Thread0 - thread7 work on the
+    // first 8 subblocks; thread8 - thread15 work on the last 8 subblocks.
+    // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
+    // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
+    // works on a total of 16 weight values.
+ int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
+ int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+ int ip = tid/8; // first or second half of (super) block (0 or 1)
+ int il = tid%8; // each half has 8 parts, one per scale
+ int n = 4; // 4 scales at a time (and 4 sums)
+ int l0 = n*il; // offset into half-block, 0..28
+ int is = 8*ip + l0/16; // 0, 1, 8, 9
+
+ int y_offset = 128*ip + l0;
+ int q_offset_l = 64*ip + l0;
+ int q_offset_h = 32*ip + l0;
+
+ for (int i = ix; i < nb; i += BLOCK_STRIDE) {
+
+ global uint8_t * q1 = x[i].ql + q_offset_l;
+ global uint8_t * q2 = q1 + QK_K/8;
+ global uint8_t * qh = x[i].qh + q_offset_h;
+ global int8_t * sc = x[i].scales + is;
+
+ global float * y = yy + i * QK_K + y_offset;
+
+ float dall = x[i].d;
+
+ float4 sums = {0.f, 0.f, 0.f, 0.f};
+
+ sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
+
+ sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
+
+ sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
+
+ sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
+
+ sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
+ }
+
+ float tot = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+ }
+}
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+//------------------------------------------------------------------------------
+// kernel_mul_mv_q6_K_f32_flat
+//------------------------------------------------------------------------------
+#define Q6_K_MASK1 0x03
+#define Q6_K_MASK2 0x0C
+#define Q6_K_MASK3 0x30
+#define Q6_K_MASK4 0xC0
+
+#define QK_K 256
+
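+// Partial dot product of one thread's 16 weights from super block `ib` of a
+// flattened row against the corresponding entries of y. Same math as the loop
+// body of kernel_mul_mv_q6_K_f32, but ql/qh/scales/d are read from separate
+// flat arrays instead of an array of block_q6_K.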
+inline float block_q_6_K_dot_y_flat(
+ global uchar * blk_ql,
+ global uchar * blk_qh,
+ global char * blk_scales,
+ global half * blk_d,
+ global float * yy,
+ int ib,
+ int ip,
+ int is,
+ int l0
+) {
+ int y_offset = 128*ip + l0;
+ int q_offset_l = 64*ip + l0;
+ int q_offset_h = 32*ip + l0;
+
+ global uchar * q1 = blk_ql + ib*128 + q_offset_l;
+ global uchar * q2 = q1 + QK_K/8;
+ global uchar * qh = blk_qh + ib*64 + q_offset_h;
+ global char * sc = blk_scales + ib*16 + is;
+
+ global float * y = yy + ib * QK_K + y_offset;
+
+ float dall = blk_d[ib];
+
+ float sumf = 0;
+ float4 sums = {0.f, 0.f, 0.f, 0.f};
+
+ sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
+ sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
+ sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
+ sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
+
+ sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
+ sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
+ sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
+ sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
+
+ sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
+ sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
+ sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
+ sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
+
+ sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
+ sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
+ sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
+ sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
+
+ sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
+
+ return sumf;
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4
+#define N_SIMDGROUP 2
+#define N_SIMDWIDTH 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 2
+#define N_SIMDWIDTH 64
+#endif
+
+#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q6_K_f32_flat(
+ global uchar * src0_ql,
+ global uchar * src0_qh,
+ global char * src0_s,
+ global half * src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int nb = ne00/QK_K;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ int first_row = (N_SIMDGROUP * r0 + get_sub_group_id()) * N_DST;
+
+ ulong offset_src0 = first_row*nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
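+    // offset_src0 counts super blocks; each super block occupies 128 bytes of ql,
+    // 64 bytes of qh, 16 scales and one half-precision d in the flat arrays.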
+ ulong offset_src0_ql = offset_src0 * 128;
+ ulong offset_src0_qh = offset_src0 * 64;
+ ulong offset_src0_s = offset_src0 * 16;
+ ulong offset_src0_d = offset_src0;
+
+ global uchar * blk_ql = (global uchar *) src0_ql + offset_src0_ql;
+ global uchar * blk_qh = (global uchar *) src0_qh + offset_src0_qh;
+ global char * blk_scales = (global char *) src0_s + offset_src0_s;
+ global half * blk_d = (global half *) src0_d + offset_src0_d;
+ global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
+ int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+ int ip = tid/8; // first or second half of (super) block (0 or 1)
+ int il = tid%8; // each half has 8 parts, one per scale
+ int n = 4; // 4 scales at a time (and 4 sums)
+ int l0 = n*il; // offset into half-block, 0..28
+ int is = 8*ip + l0/16; // 0, 1, 8, 9
+
+ float4 sumf = 0;
+
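+    // Accumulate up to N_DST consecutive rows; rows past ne01 are skipped so the
+    // last (partial) group of rows stays in bounds.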
+ for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
+ if (first_row + 0 < ne01) {
+ sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
+ }
+ if (first_row + 1 < ne01) {
+ sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
+ }
+ if (first_row + 2 < ne01) {
+ sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
+ }
+ if (first_row + 3 < ne01) {
+ sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
+ }
+ }
+
+ float4 tot = (float4)(
+ sub_group_reduce_add(sumf.s0),
+ sub_group_reduce_add(sumf.s1),
+ sub_group_reduce_add(sumf.s2),
+ sub_group_reduce_add(sumf.s3)
+ );
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+ }
+}