const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
// copy data to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
}
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
CL_CHECK(clFinish(queue));
}
// copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}
}
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
const int nb10 = src1->nb[0];
const int nb11 = src1->nb[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
const int x_ne = ne01 * ne00;
bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
// copy src0 to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
}
// convert src1 to fp16
// TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
- char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+ char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
if (src1_cont_rows) {
if (src1_cont_cols) {
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
}
else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
}
}
}
else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- for (int64_t i00 = 0; i00 < ne10; i00++) {
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
// very slow due to no inlining
- tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+ tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
}
}
}
// copy dst to host, then convert to float
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
}
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const ggml_type type = src0->type;
const bool mul_mat_vec = ne11 == 1;
+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
size_t ev_idx = 0;
std::vector<cl_event> events;
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
// copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) {
- events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ if (i02 != pi02 || i03 != pi03) {
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ pi02 = i02;
+ pi03 = i03;
+ }
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->extra;
} else {
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
// copy src1 to device
events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
// compute
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
// copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
events.emplace_back();
}
// copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
for (auto *event : events) {
clReleaseEvent(event);