static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
+// WARN:
+// Misconfiguration can lead to problems that are hard to reason about:
+// * At best it crashes or talks nonsense.
+// * At worst the output is subtly wrong and hard to notice.
+//
+// An op has to enable INIT or FINALIZE when any of its branches needs that pass.
+// Take care with compile options (e.g., GGML_USE_xxx).
+static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
+static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
+static void ggml_setup_op_has_task_pass(void) {
+ { // INIT
+ bool * I = GGML_OP_HAS_INIT;
+
+ I[GGML_OP_ACC ] = true;
+ I[GGML_OP_MUL_MAT ] = true;
+ I[GGML_OP_OUT_PROD ] = true;
+ I[GGML_OP_SET ] = true;
+ I[GGML_OP_GET_ROWS_BACK ] = true;
+ I[GGML_OP_DIAG_MASK_INF ] = true;
+ I[GGML_OP_DIAG_MASK_ZERO ] = true;
+ I[GGML_OP_CONV_1D_S1_PH ] = true;
+ I[GGML_OP_CONV_1D_S2_PH ] = true;
+ I[GGML_OP_CONV_2D_SK_P0 ] = true;
+ I[GGML_OP_FLASH_ATTN_BACK ] = true;
+ I[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ }
+
+ { // FINALIZE
+ bool * F = GGML_OP_HAS_FINALIZE;
+
+ F[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ }
+}
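+
+// Example (hypothetical, for illustration only): a new op whose implementation
+// needed an INIT pass in any of its branches would be registered inside
+// ggml_setup_op_has_task_pass() just like the entries above, e.g.:
+//
+//     GGML_OP_HAS_INIT[GGML_OP_MY_NEW_OP] = true;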
+
//
// ggml context
//
ggml_cl_init();
#endif
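+ // set up the per-op INIT/FINALIZE pass tables once, on first initialization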
+ ggml_setup_op_has_task_pass();
+
is_first_call = false;
}
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
- params.nth = node->n_tasks;
- ggml_compute_forward(&params, node);
- ggml_graph_compute_perf_stats_node(node, state->shared);
+ if (GGML_OP_HAS_FINALIZE[node->op]) {
+ params.nth = node->n_tasks;
+ ggml_compute_forward(&params, node);
+ ggml_graph_compute_perf_stats_node(node, state->shared);
+ }
}
// distribute new work or execute it directly if 1T
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
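+ // n_tasks is used by all three passes (INIT, COMPUTE, FINALIZE), so set it once up front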
+ params.nth = node->n_tasks;
+
/* INIT */
- params.type = GGML_TASK_INIT;
- params.nth = node->n_tasks;
- ggml_compute_forward(&params, node);
+ if (GGML_OP_HAS_INIT[node->op]) {
+ params.type = GGML_TASK_INIT;
+ ggml_compute_forward(&params, node);
+ }
if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
- params.type = GGML_TASK_FINALIZE;
- ggml_compute_forward(&params, node);
- ggml_graph_compute_perf_stats_node(node, state->shared);
+ if (GGML_OP_HAS_FINALIZE[node->op]) {
+ params.type = GGML_TASK_FINALIZE;
+ ggml_compute_forward(&params, node);
+ ggml_graph_compute_perf_stats_node(node, state->shared);
+ }
} else {
break;
}