#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine whether it is better to use BLAS
// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
//const int64_t ne00 = src0->ne[0];
//const int64_t ne01 = src0->ne[1];
#endif
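
The hunk above only shows the signature change; the body is elided. For reference, a minimal sketch of the kind of size heuristic the comment describes — the contiguity checks and the threshold of 32 are assumptions for illustration, not taken from this diff:

static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    const int64_t ne10 = src1->ne[0]; // shared dimension
    const int64_t ne0  = dst->ne[0];  // output rows
    const int64_t ne1  = dst->ne[1];  // output cols

    // dispatch to BLAS only when the operands are contiguous and the matrices
    // are large enough to amortize the dequantization/copy overhead
    // (the threshold of 32 is an assumed placeholder, tune for the target system)
    return ggml_is_contiguous(src0) &&
           ggml_is_contiguous(src1) &&
           (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
}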
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+ if (ggml_compute_forward_mul_mat_use_blas(dst)) {
if (params->ith != 0) {
return;
}
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
- }
-#elif defined(GGML_USE_CLBLAST)
- if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
- }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
- }
-#endif
} break;
case GGML_OP_MUL_MAT_ID:
{
state->shared->node_n += 1;
return (thread_ret_t) GGML_EXIT_ABORTED;
}
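
For context, the `node_n += 1` plus `GGML_EXIT_ABORTED` return above is the compute thread's abort path. A hedged sketch of that pattern, assuming the `abort_callback`/`abort_callback_data` fields that ggml's cgraph carries:

// sketch: compute threads poll the user-supplied abort callback between
// nodes; on abort, advance node_n so spinning threads observe the change
// and exit, then report GGML_EXIT_ABORTED to the caller
if (cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) {
    state->shared->node_n += 1;
    return (thread_ret_t) GGML_EXIT_ABORTED;
}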
+
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
// all other threads are finished and spinning
// do finalize and init here so we don't have to synchronize again
} else {
// wait for other threads to finish
const int last = node_n;
+
+ const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
while (true) {
// TODO: this sched_yield can have a significant impact on performance - either positive or negative,
// depending on the workload and the operating system.
// since it is not clear which approach is best, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- sched_yield();
-#endif
+ // UPD: adding the do_yield flag seems to resolve the issue universally
+ if (do_yield) {
+ sched_yield();
+ }
node_n = atomic_load(&state->shared->node_n);
if (node_n != last) break;
} else
#endif
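
The wait loop above reads as a small self-contained pattern: spin on the shared node counter, but yield the time slice when the previous node was a BLAS matmul, since the BLAS library's own worker threads need the cores. A standalone sketch of that pattern (hypothetical helper name, not part of ggml):

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

// spin-wait until the shared node counter moves past `last`;
// optionally yield so that e.g. OpenBLAS worker threads get CPU time
static int wait_for_next_node(atomic_int * shared_node_n, int last, bool do_yield) {
    int node_n;
    while (true) {
        if (do_yield) {
            sched_yield();
        }
        node_n = atomic_load(shared_node_n);
        if (node_n != last) {
            break;
        }
    }
    return node_n;
}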
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+ if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for a single 2D matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
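
As a worked example of this sizing (hypothetical shapes): for a quantized 4096x4096 src0, the F32 scratch for one 2D slice is 4 bytes x 4096 x 4096 = 64 MiB, regardless of which quantized type it is converted from:

// hypothetical shapes: dequantizing one 4096 x 4096 slice of src0 to F32
const int64_t ne00 = 4096, ne01 = 4096;
const size_t cur = ggml_type_size(GGML_TYPE_F32)*ne00*ne01; // 4*4096*4096 = 64 MiB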