Updated the n_tasks calculation to use the maximum number of
threads available. This improved prompt eval
performance by around 5% for DOT kernels and by
around 10% for MMLA kernels on AWS Graviton3.
} break;
case GGML_OP_SOFT_MAX:
{
- n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+ n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{