bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
+ if (params->ith != 0) {
+ return;
+ }
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
- if (params->ith != 0) {
- return;
- }
+ const int64_t ne_plane = ne01*ne00;
+ const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+ UNUSED(desired_wsize);
if (params->type == GGML_TASK_INIT) {
+ if (type != GGML_TYPE_F32) {
+ assert(params->wsize >= desired_wsize);
+ // parallelize by src0 rows
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ // broadcast src0 into src1 across 2nd,3rd dimension
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
+
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+ ggml_to_float_t const to_float = type_traits[type].to_float;
+
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+ to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+ }
+ }
+ }
+ }
return;
}
return;
}
+ // perform sgemm, parallelization controlled by blas lib
+ if (ith != 0) {
+ return;
+ }
+
+ const int64_t tgemm0 = ggml_perf_time_us();
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
- // broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
- float * const wdata = params->wdata;
- ggml_to_float_t const to_float = type_traits[type].to_float;
-
- size_t id = 0;
- for (int64_t i01 = 0; i01 < ne01; ++i01) {
- to_float((const char *) x + i01*nb01, wdata + id, ne00);
- id += ne00;
- }
-
- assert(id*sizeof(float) <= params->wsize);
- x = wdata;
+ x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
0.0f, d, ne01);
}
}
+ //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
#endif
if (params->type == GGML_TASK_INIT) {
+ if (ith != 0) {
+ return;
+ }
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
if (params->type == GGML_TASK_INIT) {
+ if (ith != 0) {
+ return;
+ }
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
return;
}
#endif
+ if (ith != 0) {
+ return;
+ }
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
+ if (ith != 0) {
+ return;
+ }
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
+ if (params->ith != 0) {
+ return;
+ }
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
+ if (params->ith != 0) {
+ return;
+ }
memset(dst->data, 0, ggml_nbytes(dst));
}
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
+ if (params->ith != 0) {
+ return;
+ }
memset(dst->data, 0, ggml_nbytes(dst));
}
GGML_ASSERT(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) {
+ if (ith != 0) {
+ return;
+ }
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
+ if (ith != 0) {
+ return;
+ }
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
+ if (ith != 0) {
+ return;
+ }
memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
+ if (ith != 0) {
+ return;
+ }
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
if (!inplace && params->type == GGML_TASK_INIT) {
+ if (params->ith != 0) {
+ return;
+ }
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
return;
}
const int n_threads;
// synchronization primitives
- atomic_int n_active; // num active threads
- atomic_int node_n; // active graph node
+ atomic_int n_active; // num active threads
+ atomic_int node_n; // active graph node
+ atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
return n_tasks;
}
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+ /* wait for other threads to finish: poll the shared node_n until it changes.
+  *
+  * node_n   - in: the value this thread last saw; out: the new value loaded from state->shared->node_n
+  * state    - compute state of the calling thread; only state->shared->node_n is read here
+  * do_yield - when true, sched_yield() is called before every poll, including the first one
+  *
+  * There is no timeout: the loop exits only once the loaded value differs from the value on entry. */
+ const int last_node_n = * node_n; // value on entry, used to detect the change
+
+ while (true) {
+ if (do_yield) {
+ sched_yield();
+ }
+
+ * node_n = atomic_load(&state->shared->node_n); // every polled value is written through to the caller
+ if (* node_n != last_node_n) break; // leave as soon as the shared value has moved on
+ }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+ /* wait for other threads to finish: poll the shared node_task until it changes.
+  *
+  * task_phase - in: the phase value this thread last saw; out: the new value loaded from state->shared->node_task
+  * state      - compute state of the calling thread; only state->shared->node_task is read here
+  * do_yield   - when true, sched_yield() is called before every poll, including the first one
+  *
+  * There is no timeout: the loop exits only once the loaded value differs from the value on entry. */
+ const int last_task_phase = * task_phase; // value on entry, used to detect the change
+
+ while (true) {
+ if (do_yield) {
+ sched_yield();
+ }
+
+ * task_phase = atomic_load(&state->shared->node_task); // every polled value is written through to the caller
+ if (* task_phase != last_task_phase) break; // leave as soon as the shared value has moved on
+ }
+}
+
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
set_numa_thread_affinity(state->ith, n_threads);
- int node_n = -1;
+ int node_n = -1;
+ int task_phase = GGML_TASK_FINALIZE;
while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
params.nth = n_tasks;
- /* INIT */
- if (GGML_OP_HAS_INIT[node->op]) {
- params.type = GGML_TASK_INIT;
- ggml_compute_forward(¶ms, node);
- }
-
if (n_tasks == 1) {
+ /* INIT */
+ if (GGML_OP_HAS_INIT[node->op]) {
+ params.type = GGML_TASK_INIT;
+ ggml_compute_forward(¶ms, node);
+ }
+
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE;
}
}
- atomic_store(&state->shared->n_active, n_threads);
- atomic_store(&state->shared->node_n, node_n);
+ task_phase = GGML_TASK_INIT;
+ atomic_store(&state->shared->n_active, n_threads);
+ atomic_store(&state->shared->node_n, node_n);
+ atomic_store(&state->shared->node_task, task_phase);
} else {
- // wait for other threads to finish
- const int last = node_n;
-
- const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
- while (true) {
- // TODO: this sched_yield can have significant impact on the performance - either positive or negative
- // depending on the workload and the operating system.
- // since it is not clear what is the best approach, it should potentially become user-configurable
- // ref: https://github.com/ggerganov/ggml/issues/291
- // UPD: adding the do_yield flag seems to resolve the issue universally
- if (do_yield) {
- sched_yield();
- }
-
- node_n = atomic_load(&state->shared->node_n);
- if (node_n != last) break;
- };
+ ggml_graph_compute_thread_sync_node(&node_n, state, false);
+ ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}
// check if we should stop
if (node_n >= cgraph->n_nodes) break;
- /* COMPUTE */
+ /* INIT & COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads);
struct ggml_compute_params params = {
- /*.type =*/ GGML_TASK_COMPUTE,
+ /*.type =*/ GGML_TASK_INIT,
/*.ith =*/ state->ith,
/*.nth =*/ n_tasks,
/*.wsize =*/ cplan->work_size,
};
if (state->ith < n_tasks) {
+ if (GGML_OP_HAS_INIT[node->op]) {
+ ggml_compute_forward(¶ms, node);
+ }
+ }
+
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+ task_phase = GGML_TASK_COMPUTE;
+ atomic_store(&state->shared->n_active, n_threads);
+ atomic_store(&state->shared->node_task, task_phase);
+ }
+ else {
+ // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+ // depending on the workload and the operating system.
+ // since it is not clear what is the best approach, it should potentially become user-configurable
+ // ref: https://github.com/ggerganov/ggml/issues/291
+ // UPD: adding the do_yield flag seems to resolve the issue universally
+ const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+ ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+ }
+
+ if (state->ith < n_tasks) {
+ params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(¶ms, node);
}
+
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+ task_phase = GGML_TASK_FINALIZE;
+ atomic_store(&state->shared->n_active, n_threads);
+ atomic_store(&state->shared->node_task, task_phase);
+ }
+ else {
+ ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+ }
}
return GGML_EXIT_SUCCESS;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
- // here we need memory just for single 2D matrix from src0
- cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+ // here we need memory for fully dequantized matrix from src0
+ cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
}
} else
#endif
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
+ /*.node_task =*/ GGML_TASK_FINALIZE,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};