// ggml_compute_forward_mul_mat
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
const int64_t ne10 = src1->ne[0];
#endif
const int64_t ne11 = src1->ne[1];
}
return;
}
+#elif defined(GGML_USE_CLBLAST)
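+ // CLBlast path: hand the whole mat-mul to the OpenCL backend when it can take it;
+ // only thread 0 issues the call during GGML_TASK_COMPUTE, the remaining threads return immediately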
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-#if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
-#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
-#endif
}
}
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
}
return;
}
+#elif defined(GGML_USE_CLBLAST)
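+ // same CLBlast offload for the f16 branch; params->wdata/wsize provide whatever host scratch ggml_cl_mul_mat may need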
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));
assert(id*sizeof(float) <= params->wsize);
}
-#if defined(GGML_USE_CLBLAST)
- const float * x = wdata;
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
-#else
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
// zT = y * xT
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
-#endif
}
}
}
return;
}
+#elif defined(GGML_USE_CLBLAST)
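+ // quantized weights are likewise offloaded whole to ggml_cl_mul_mat when the shapes qualify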
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-#if defined(GGML_USE_CLBLAST)
- const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
-#else
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
id += ne00;
}
assert(id*sizeof(float) <= params->wsize);
}
const float * x = wdata;
-#endif
-#if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- type);
-#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
-#endif
}
}
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
}
else
+#elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+ node->n_tasks = 1; // TODO: this actually is doing nothing
+ // the threads are still spinning
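+ // ask the OpenCL backend how much host scratch (wdata) the offloaded mat-mul will need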
+ cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+ }
+ else
#endif
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
#endif
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
}
#endif
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
fprintf(fp, "%s |", node->name);
}
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
- i, node->ne[0], node->ne[1],
- GGML_OP_SYMBOL[node->op]);
+ if (node->n_dims == 2) {
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+ } else {
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+ }
+
if (node->grad) {
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);