}
#endif
-/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
#define GGML_FREE(ptr) free(ptr)
#define UNUSED GGML_UNUSED
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
+#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
return CLOCKS_PER_SEC/1000;
}
-#ifdef GGML_PERF
-#define ggml_perf_time_ms() ggml_time_ms()
-#define ggml_perf_time_us() ggml_time_us()
-#define ggml_perf_cycles() ggml_cycles()
-#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
-#else
-#define ggml_perf_time_ms() 0
-#define ggml_perf_time_us() 0
-#define ggml_perf_cycles() 0
-#define ggml_perf_cycles_per_ms() 0
-#endif
-
//
// cross-platform UTF-8 file paths
//
int n_objects;
- struct ggml_object* objects_begin;
- struct ggml_object* objects_end;
+ struct ggml_object * objects_begin;
+ struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
};
struct ggml_compute_state_shared {
- const struct ggml_cgraph* cgraph;
- const struct ggml_cplan* cplan;
-
- int64_t perf_node_start_cycles;
- int64_t perf_node_start_time_us;
+ const struct ggml_cgraph * cgraph;
+ const struct ggml_cplan * cplan;
int n_threads;
atomic_int n_barrier;
atomic_int n_barrier_passed;
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
- void* abort_callback_data;
+ void * abort_callback_data;
- atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+ atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
+
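+ // status of the graph computation, shared by all threads (e.g. set to GGML_STATUS_ABORTED when the abort callback fires)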
+ enum ggml_status ec;
};
struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
- struct ggml_compute_state_shared* shared;
- enum ggml_status ec;
+ struct ggml_compute_state_shared * shared;
+};
+
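+// per-op compute parameters: the old task type field is gone, threads now synchronize
+// explicitly with ggml_barrier() through the shared state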
+struct ggml_compute_params {
+ // ith = thread index, nth = number of threads
+ int ith, nth;
+
+ // work buffer for all threads
+ size_t wsize;
+ void * wdata;
+
+ struct ggml_compute_state_shared * shared;
};
//
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
-// WARN:
-// Mis-configuration can lead to problem that's hard to reason about:
-// * At best it crash or talks nosense.
-// * At worst it talks slightly difference but hard to perceive.
-//
-// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
-// Take care about compile options (e.g., GGML_USE_xxx).
-static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
-static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
-
-static void ggml_setup_op_has_task_pass(void) {
- { // INIT
- bool * p = GGML_OP_HAS_INIT;
-
- p[GGML_OP_ACC ] = true;
- p[GGML_OP_MUL_MAT ] = true;
- p[GGML_OP_MUL_MAT_ID ] = true;
- p[GGML_OP_OUT_PROD ] = true;
- p[GGML_OP_SET ] = true;
- p[GGML_OP_GET_ROWS_BACK ] = true;
- p[GGML_OP_DIAG_MASK_INF ] = true;
- p[GGML_OP_DIAG_MASK_ZERO ] = true;
- p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
- p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
- p[GGML_OP_FLASH_ATTN_BACK ] = true;
- p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
- p[GGML_OP_ADD_REL_POS ] = true;
- }
-
- { // FINALIZE
- bool * p = GGML_OP_HAS_FINALIZE;
-
- p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
- }
-}
-
//
// NUMA support
//
static struct ggml_state g_state;
static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
-// barrier via spin lock
+// critical section via spin lock
inline static void ggml_critical_section_start(void) {
while (atomic_flag_test_and_set(&g_state_critical)) {
// spin
}
}
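+
+// full barrier across all compute threads:
+// - with OpenMP it maps directly to `#pragma omp barrier`
+// - otherwise it is a counting barrier: the last thread to arrive resets n_barrier and bumps the
+//   n_barrier_passed generation counter, while the other threads spin on it and eventually sched_yield()
+//
+// typical usage pattern in the ops below (sketch), replacing the old INIT/FINALIZE passes:
+//
+//     if (params->ith == 0) {
+//         // serial part, e.g. zeroing dst or preparing wdata
+//     }
+//     ggml_barrier(params->shared);
+//     // ... all threads run the parallel part ...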
+#ifdef GGML_USE_OPENMP
+static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+ if (shared->n_threads == 1) {
+ return;
+ }
+
+ #pragma omp barrier
+}
+#else
+static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+ if (shared->n_threads == 1) {
+ return;
+ }
+
+ atomic_int * n_barrier = &shared->n_barrier;
+ atomic_int * n_barrier_passed = &shared->n_barrier_passed;
+
+ int n_threads = shared->n_threads;
+ int passed_old = atomic_load(n_barrier_passed);
+
+ if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+ // last thread
+ atomic_store(n_barrier, 0);
+ atomic_fetch_add(n_barrier_passed, 1);
+ } else {
+ // wait for other threads
+ const int n_spin_before_sleep = 100000;
+ while (true) {
+ for (int i = 0; i < n_spin_before_sleep; i++) {
+ if (atomic_load(n_barrier_passed) != passed_old) {
+ return;
+ }
+ #if defined(__SSE3__)
+ _mm_pause();
+ #endif
+ }
+ sched_yield();
+ }
+ }
+}
+#endif
+
// TODO: make this somehow automatically executed
// some sort of "sentry" mechanism
inline static void ggml_critical_section_end(void) {
}
}
#else
- GGML_UNUSED(numa_flag);
+ UNUSED(numa_flag);
// TODO
#endif
}
enum ggml_unary_op uop = ggml_get_unary_op(t);
return ggml_unary_op_name(uop);
}
- else {
- return ggml_op_name(t->op);
- }
+ return ggml_op_name(t->op);
}
GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
- ggml_setup_op_has_task_pass();
-
is_first_call = false;
}
/*.flags =*/ 0,
/*.grad =*/ NULL,
/*.src =*/ { NULL },
- /*.perf_runs =*/ 0,
- /*.perf_cycles =*/ 0,
- /*.perf_time_us =*/ 0,
/*.view_src =*/ view_src,
/*.view_offs =*/ view_offs,
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
- /*.padding =*/ { 0 },
+ ///*.padding =*/ { 0 },
};
#ifdef __clang__
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type);
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const size_t nb00 = src0->nb[0];
const size_t nb0 = dst->nb[0];
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// scalar to add
const float v = *(float *) src1->data;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// scalar to add
const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// scalar to add
const float v = *(float *) src1->data;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// scalar to add
const float v = *(float *) src1->data;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// scalar to add
const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data);
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
- if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
- if (params->ith != 0) {
- return;
+ if (!inplace) {
+ if (params->ith == 0) {
+ // memcpy needs to be synchronized across threads to avoid race conditions.
+ // => do it on thread 0 only and synchronize with the barrier below
+ memcpy(
+ ((char *) dst->data),
+ ((char *) src0->data),
+ ggml_nbytes(dst));
}
- // memcpy needs to be synchronized across threads to avoid race conditions.
- // => do it in INIT phase
- memcpy(
- ((char *) dst->data),
- ((char *) src0->data),
- ggml_nbytes(dst));
- }
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
+ ggml_barrier(params->shared);
}
const int ith = params->ith;
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- assert(params->ith == 0);
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
- assert(ggml_are_same_shape(src0, dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ assert(ggml_are_same_shape(src0, dst));
+
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
- assert(ggml_are_same_shape(src0, dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ assert(ggml_are_same_shape(src0, dst));
+
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(params->ith == 0);
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
- assert(ggml_is_scalar(dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
- assert(ggml_is_scalar(dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ assert(ggml_is_scalar(dst));
+
assert(src0->nb[0] == sizeof(ggml_fp16_t));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
- assert(ggml_is_scalar(dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ assert(ggml_is_scalar(dst));
+
assert(src0->nb[0] == sizeof(ggml_bf16_t));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(params->ith == 0);
- GGML_ASSERT(ggml_can_repeat(src0, dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
+
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(params->ith == 0);
- GGML_ASSERT(ggml_can_repeat(src0, dst));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
+
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(params->ith == 0);
- GGML_ASSERT(ggml_can_repeat(dst, src0));
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
+ GGML_ASSERT(ggml_can_repeat(dst, src0));
+
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(ggml_are_same_shape(src0, dst));
assert(ggml_are_same_shape(src0, grad));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int ith = params->ith;
const int nth = params->nth;
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const bool src1_cont = ggml_is_contiguous(src1);
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
// broadcast factors
const int64_t r2 = ne12 / ne02;
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
- struct ggml_tensor * dst,
- struct ggml_compute_state * state) {
+ struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
- // broadcast factors
- const int64_t r2 = ne12 / ne02;
- const int64_t r3 = ne13 / ne03;
- UNUSED(r2);
- UNUSED(r3);
-
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if GGML_USE_LLAMAFILE
+ // broadcast factors
+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
const bool src1_cont = ggml_is_contiguous(src1);
if (src1_cont) {
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
- params->type,
src0->type,
src1->type,
dst->type))
UseGgmlGemm1:;
#endif
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
- // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
- atomic_store(&state->shared->current_chunk, nth);
- if (src1->type != vec_dot_type) {
- char * wdata = params->wdata;
- const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
- assert(params->wsize >= ne11*ne12*ne13*row_size);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
- wdata += row_size;
- }
+ if (src1->type != vec_dot_type) {
+ char * wdata = params->wdata;
+
+ const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+ const size_t nbw2 = nbw1*ne11;
+ const size_t nbw3 = nbw2*ne12;
+
+ assert(params->wsize >= ne13*nbw3);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
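+ // parallelize the conversion of src1 to vec_dot_type over rows: thread ith handles rows i11 = ith, ith + nth, ...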
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+ ne10);
}
}
}
-
- return;
}
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
+ if (ith == 0) {
+ // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
+ atomic_store(&params->shared->current_chunk, nth);
}
+ ggml_barrier(params->shared);
+
#if GGML_USE_LLAMAFILE
if (src1->type != vec_dot_type) {
const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
- params->type,
src0->type,
vec_dot_type,
dst->type))
UseGgmlGemm2:;
#endif
-#ifdef GGML_PERF
- int chunks_executed = 0;
- UNUSED(chunks_executed);
-#endif
-
// This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
const int64_t nr0 = ne0;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
- //if (ith == 0)
- // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
-
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
-#ifdef GGML_PERF
- chunks_executed++;
-#endif
-
if (nth >= nchunk0 * nchunk1) {
break;
}
- current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
+ current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
}
-
-#ifdef GGML_PERF
- // These numbers are useful when trying to measure how well the threading scheduling works.
- //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
- //float time = (ggml_perf_time_us() - t0);
- //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
-#endif
}
// ggml_compute_forward_mul_mat_id
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
+ if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
- if (src1->type != vec_dot_type) {
- const size_t row_size = ggml_row_size(vec_dot_type, ne10);
- assert(params->wsize >= ne11*ne12*ne13*row_size);
- assert(src1->type == GGML_TYPE_F32);
+ const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+ const size_t nbw2 = nbw1*ne11;
+ const size_t nbw3 = nbw2*ne12;
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
- wdata += row_size;
- }
+ assert(params->wsize >= ne13*nbw3);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+ ne10);
}
}
}
+ }
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
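+ // thread 0 groups the src1 rows by the src0 (expert) matrix they use; the other threads wait at the barrier below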
+ if (ith == 0) {
// initialize matrix_row_counts
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
-#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
-
// group rows by src0 matrix
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
for (int id = 0; id < n_ids; ++id) {
matrix_row_counts[i02] += 1;
}
}
-
- return;
}
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
+ ggml_barrier(params->shared);
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- // int64_t t0 = ggml_perf_time_us();
- // UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
+ if (ith == 0) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
}
+ ggml_barrier(params->shared);
// dst[:,:,:,:] = 0
// for i2,i3:
}
}
}
-
- //int64_t t1 = ggml_perf_time_us();
- //static int64_t acc = 0;
- //acc += t1 - t0;
- //if (t1 - t0 > 10) {
- // printf("\n");
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
- // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
- //}
}
static void ggml_compute_forward_out_prod_q_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- // int64_t t0 = ggml_perf_time_us();
- // UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS;
const int ith = params->ith;
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
+ if (ith == 0) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
}
+ ggml_barrier(params->shared);
// parallelize by last three dimensions
ggml_vec_mad_f32(ne0, d, wdata, *s1);
}
}
-
- //int64_t t1 = ggml_perf_time_us();
- //static int64_t acc = 0;
- //acc += t1 - t0;
- //if (t1 - t0 > 10) {
- // printf("\n");
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
- // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
- //}
}
static void ggml_compute_forward_out_prod(
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// scale factor
float v;
memcpy(&v, dst->op_params, sizeof(float));
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
- if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
- if (params->ith != 0) {
- return;
+ if (!inplace) {
+ if (params->ith == 0) {
+ // memcpy needs to be synchronized across threads to avoid race conditions.
+ // => do it on thread 0 only and synchronize with the barrier below
+ memcpy(
+ ((char *) dst->data),
+ ((char *) src0->data),
+ ggml_nbytes(dst));
}
- // memcpy needs to be synchronized across threads to avoid race conditions.
- // => do it in INIT phase
- memcpy(
- ((char *) dst->data),
- ((char *) src0->data),
- ggml_nbytes(dst));
- }
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
+ ggml_barrier(params->shared);
}
const int ith = params->ith;
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- GGML_ASSERT(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
GGML_ASSERT(ggml_is_contiguous(dst));
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (params->ith != 0) {
- return;
- }
- memset(dst->data, 0, ggml_nbytes(dst));
- }
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
+ memset(dst->data, 0, ggml_nbytes(dst));
const int nc = src0->ne[0];
const int nr = ggml_nelements(src1);
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- GGML_ASSERT(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
GGML_ASSERT(ggml_is_contiguous(dst));
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (params->ith != 0) {
- return;
- }
- memset(dst->data, 0, ggml_nbytes(dst));
- }
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
+ memset(dst->data, 0, ggml_nbytes(dst));
const int nc = src0->ne[0];
const int nr = ggml_nelements(src1);
const struct ggml_tensor * src0 = dst->src[0];
- GGML_ASSERT(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
GGML_ASSERT(n_past >= 0);
- if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
- if (ith != 0) {
- return;
+ if (!inplace) {
+ if (ith == 0) {
+ // memcpy needs to be synchronized across threads to avoid race conditions.
+ // => do it on thread 0 only and synchronize with the barrier below
+ GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+ GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+ memcpy(
+ ((char *) dst->data),
+ ((char *) src0->data),
+ ggml_nbytes(dst));
}
- // memcpy needs to be synchronized across threads to avoid race conditions.
- // => do it in INIT phase
- GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
- GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
- memcpy(
- ((char *) dst->data),
- ((char *) src0->data),
- ggml_nbytes(dst));
- }
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
+ ggml_barrier(params->shared);
}
// TODO: handle transposed/permuted matrices
assert(ggml_is_contiguous(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
float scale = 1.0f;
float max_bias = 0.0f;
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src1, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// TODO: handle transposed/permuted matrices
const int ith = params->ith;
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src2 = dst->src[2];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
//const int n_past = ((int32_t *) dst->op_params)[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src2 = dst->src[2];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
//const int n_past = ((int32_t *) dst->op_params)[0];
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
+ if (ith == 0) {
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
-
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
}
+ ggml_barrier(params->shared);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
+ if (ith == 0) {
memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
-
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
}
+ ggml_barrier(params->shared);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS;
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
- if (params->type == GGML_TASK_TYPE_INIT) {
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
{
float * const wdata = (float *) dst->data;
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16);
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS;
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
- if (params->type == GGML_TASK_TYPE_INIT) {
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
+ if (ith == 0) {
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
}
memset(dst->data, 0, ggml_nbytes(dst));
-
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
}
+ ggml_barrier(params->shared);
const int32_t stride = ggml_get_op_params_i32(dst, 0);
const struct ggml_tensor * src = dst->src[0];
assert(src->type == GGML_TYPE_F32);
- assert(params->ith == 0);
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
- GGML_ASSERT(params->ith == 0);
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int ith = params->ith;
const struct ggml_tensor * src0 = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT( dst->nb[0] == sizeof(float));
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_ASSERT(dst->nb[0] == sizeof(float));
const int ith = params->ith;
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->nb[0] == sizeof(float));
const struct ggml_tensor * src0 = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(nb0 == sizeof(float));
const struct ggml_tensor * v,
const struct ggml_tensor * mask,
struct ggml_tensor * dst) {
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
const int64_t rv2 = neq2/nev2;
const int64_t rv3 = neq3/nev3;
- if (params->type == GGML_TASK_TYPE_INIT) {
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// parallelize by q rows using ggml_vec_dot_f32
// total rows in q
const struct ggml_tensor * v = dst->src[2];
const struct ggml_tensor * d = dst->src[3];
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith == 0) {
- memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
- }
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
+ if (ith == 0) {
+ memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
}
+ ggml_barrier(params->shared);
const int64_t elem_q = ggml_nelements(q);
const int64_t elem_k = ggml_nelements(k);
static void ggml_compute_forward_ssm_conv_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const struct ggml_tensor * src0 = dst->src[0]; // conv_state
const struct ggml_tensor * src1 = dst->src[1]; // x
const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
static void ggml_compute_forward_ssm_scan_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const struct ggml_tensor * src0 = dst->src[0]; // s
const struct ggml_tensor * src1 = dst->src[1]; // x
const struct ggml_tensor * src2 = dst->src[2]; // dt
static void ggml_compute_forward_win_part_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
+ UNUSED(params);
const struct ggml_tensor * src0 = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
static void ggml_compute_forward_win_unpart_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
+ UNUSED(params);
const struct ggml_tensor * src0 = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
static void ggml_compute_forward_get_rel_pos_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
+ UNUSED(params);
const struct ggml_tensor * src0 = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
GGML_TENSOR_UNARY_OP_LOCALS
const struct ggml_tensor * src2 = dst->src[2];
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
- if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
- if (params->ith != 0) {
- return;
+ if (!inplace) {
+ if (params->ith == 0) {
+ memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
}
- memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
- return;
- }
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
+ ggml_barrier(params->shared);
}
-
- int64_t t0 = ggml_perf_time_us();
- UNUSED(t0);
-
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
float * src1_data = (float *) src1->data;
const struct ggml_tensor * src0 = dst->src[0];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- assert(params->ith == 0);
+ if (params->ith != 0) {
+ return;
+ }
+
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(src1));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const struct ggml_tensor * a = dst->src[0];
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * a = dst->src[0];
const struct ggml_tensor * b = dst->src[1];
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * b = dst->src[1];
const struct ggml_tensor * c = dst->src[1];
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ if (params->ith != 0) {
return;
}
const struct ggml_tensor * a = dst->src[0];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
struct ggml_map_custom1_op_params p;
memcpy(&p, dst->op_params, sizeof(p));
const struct ggml_tensor * a = dst->src[0];
const struct ggml_tensor * b = dst->src[1];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
struct ggml_map_custom2_op_params p;
memcpy(&p, dst->op_params, sizeof(p));
const struct ggml_tensor * b = dst->src[1];
const struct ggml_tensor * c = dst->src[2];
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
struct ggml_map_custom3_op_params p;
memcpy(&p, dst->op_params, sizeof(p));
GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith == 0) {
- memset(sums, 0, sizeof(float) * (nth + nth * nc));
- }
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- if (ith == 0) {
- float * dp = (float *) dst->data;
- ggml_vec_sum_f32(nth, dp, sums);
- dp[0] *= -1.0f / (float) nr;
- }
- return;
+ if (ith == 0) {
+ memset(sums, 0, sizeof(float) * (nth + nth * nc));
}
+ ggml_barrier(params->shared);
const double eps = 1e-9;
}
#endif
}
+ ggml_barrier(params->shared);
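+ // thread 0 reduces the per-thread partial sums into the final loss value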
+ if (ith == 0) {
+ float * dp = (float *) dst->data;
+ ggml_vec_sum_f32(nth, dp, sums);
+ dp[0] *= -1.0f / (float) nr;
+ }
}
static void ggml_compute_forward_cross_entropy_loss(
const int64_t ith = params->ith;
const int64_t nth = params->nth;
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
const double eps = 1e-9;
// TODO: handle transposed/permuted matrices
/////////////////////////////////
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
} break;
case GGML_OP_MUL_MAT:
{
- ggml_compute_forward_mul_mat(params, tensor, state);
+ ggml_compute_forward_mul_mat(params, tensor);
} break;
case GGML_OP_MUL_MAT_ID:
{
/*.leafs =*/ leafs_ptr,
/*.hash_table =*/ { hash_size, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
- /*.perf_runs =*/ 0,
- /*.perf_cycles =*/ 0,
- /*.perf_time_us =*/ 0,
};
return cgraph;
/*.leafs =*/ NULL,
/*.hash_table =*/ { 0, NULL },
/*.order =*/ cgraph0->order,
- /*.perf_runs =*/ 0,
- /*.perf_cycles =*/ 0,
- /*.perf_time_us =*/ 0,
};
return cgraph;
static void clear_numa_thread_affinity(void) {}
#endif
-static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
-
- node->perf_runs++;
- node->perf_cycles += cycles_cur;
- node->perf_time_us += time_us_cur;
-}
-
-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
int n_tasks = 0;
if (ggml_is_empty(node)) {
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
- case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
- case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
+ case GGML_UNARY_OP_HARDSWISH:
+ case GGML_UNARY_OP_HARDSIGMOID:
{
n_tasks = 1;
} break;
case GGML_OP_RMS_NORM_BACK:
case GGML_OP_GROUP_NORM:
case GGML_OP_CONCAT:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_MUL_MAT:
- {
- n_tasks = n_threads;
-
- // TODO: use different scheduling for different matrix sizes
- //const int nr0 = ggml_nrows(node->src[0]);
- //const int nr1 = ggml_nrows(node->src[1]);
-
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
- } break;
case GGML_OP_MUL_MAT_ID:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_OUT_PROD:
{
n_tasks = n_threads;
} break;
case GGML_OP_GET_ROWS:
{
- // FIXME: the cost of launching additional threads decreases performance with GPU offloading
- //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
- n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
+ // FIXME: get_rows can use additional threads, but the cost of launching additional threads
+ // decreases performance with GPU offloading
+ //n_tasks = n_threads;
+ n_tasks = 1;
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:
{
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
} break;
- case GGML_OP_CONV_TRANSPOSE_1D:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_IM2COL:
- {
- n_tasks = n_threads;
- } break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
{
n_tasks = n_threads;
n_tasks = 1;
} break;
case GGML_OP_UPSCALE:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_PAD:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_ARANGE:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_TIMESTEP_EMBEDDING:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_ARGSORT:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_FLASH_ATTN_EXT:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_FLASH_ATTN_BACK:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
{
}
} break;
case GGML_OP_CROSS_ENTROPY_LOSS:
- {
- n_tasks = n_threads;
- } break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
n_tasks = n_threads;
return n_tasks;
}
-#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state * state) {
- if (state->shared->n_threads == 1) {
- return;
- }
-
- #pragma omp barrier
-}
-#else
-static void ggml_barrier(struct ggml_compute_state * state) {
- if (state->shared->n_threads == 1) {
- return;
- }
-
- atomic_int * n_barrier = &state->shared->n_barrier;
- atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
-
- int n_threads = state->shared->n_threads;
- int passed_old = atomic_load(n_barrier_passed);
-
- if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
- // last thread
- atomic_store(n_barrier, 0);
- atomic_fetch_add(n_barrier_passed, 1);
- } else {
- // wait for other threads
- //while (atomic_load(n_barrier_passed) == passed_old) {
- //}
- const int n_spin_before_sleep = 100000;
- while (true) {
- for (int i = 0; i < n_spin_before_sleep; i++) {
- if (atomic_load(n_barrier_passed) != passed_old) {
- return;
- }
- #if defined(__SSE3__)
- _mm_pause();
- #endif
- }
- sched_yield();
- }
- }
-}
-#endif
-
-static thread_ret_t ggml_graph_compute_thread(void * data) {
- struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
- const struct ggml_cgraph * cgraph = state->shared->cgraph;
- const struct ggml_cplan * cplan = state->shared->cplan;
-
- const int ith = state->ith;
- const int n_threads = state->shared->n_threads;
-
- set_numa_thread_affinity(ith);
-
- struct ggml_compute_params params = {
- /*.type =*/ GGML_TASK_TYPE_INIT,
- /*.ith =*/ ith,
- /*.nth =*/ state->shared->n_threads,
- /*.wsize =*/ cplan->work_size,
- /*.wdata =*/ cplan->work_data,
- };
-
- for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
- if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
- state->ec = GGML_STATUS_ABORTED;
- return 0;
- }
-
- struct ggml_tensor * node = cgraph->nodes[node_n];
- const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-
- params.nth = n_tasks;
-
- /* INIT */
- if (GGML_OP_HAS_INIT[node->op]) {
- if (ith < n_tasks) {
- params.type = GGML_TASK_TYPE_INIT;
- ggml_compute_forward(&params, node, state);
- }
- ggml_barrier(state);
- }
-
- /* COMPUTE */
- if (ith < n_tasks) {
- params.type = GGML_TASK_TYPE_COMPUTE;
- ggml_compute_forward(&params, node, state);
- }
-
- ggml_barrier(state);
-
- /* FINALIZE */
- if (GGML_OP_HAS_FINALIZE[node->op]) {
- if (params.ith == 0) {
- params.type = GGML_TASK_TYPE_FINALIZE;
- ggml_compute_forward(&params, node, state);
- }
- ggml_barrier(state);
- }
- }
-
- return 0;
-}
-
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
if (n_threads <= 0) {
n_threads = GGML_DEFAULT_N_THREADS;
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
- const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
max_tasks = MAX(max_tasks, n_tasks);
return cplan;
}
-static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
- enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+static thread_ret_t ggml_graph_compute_thread(void * data) {
+ struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-#ifdef GGML_USE_OPENMP
- if (n_threads > 1) {
- #pragma omp parallel num_threads(n_threads)
- {
- #pragma omp single
- {
- // update the number of threads from the actual number of threads that we got from OpenMP
- n_threads = omp_get_num_threads();
- workers[0].shared->n_threads = n_threads;
- }
- ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
- }
- } else {
- ggml_graph_compute_thread(&workers[0]);
- }
-#else
- // create thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; ++j) {
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
- }
+ const struct ggml_cgraph * cgraph = state->shared->cgraph;
+ const struct ggml_cplan * cplan = state->shared->cplan;
- // this is a work thread too
- ggml_graph_compute_thread(&workers[0]);
+ set_numa_thread_affinity(state->ith);
- // join or kill thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; j++) {
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
+ struct ggml_compute_params params = {
+ /*.ith =*/ state->ith,
+ /*.nth =*/ state->shared->n_threads,
+ /*.wsize =*/ cplan->work_size,
+ /*.wdata =*/ cplan->work_data,
+ /*.shared=*/ state->shared,
+ };
+
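+ // every thread runs every node; the work split is derived from ith/nth inside each
+ // ggml_compute_forward_* op, and ggml_barrier() keeps the threads in lockstep between nodes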
+ for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+ struct ggml_tensor * node = cgraph->nodes[node_n];
+
+ ggml_compute_forward(&params, node);
+
+ if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+ state->shared->ec = GGML_STATUS_ABORTED;
}
- }
-#endif
- // don't leave affinity set on the main thread
- clear_numa_thread_affinity();
- for (int j = 0; j < n_threads; j++) {
- if (workers[j].ec != GGML_STATUS_SUCCESS) {
- compute_status = workers[j].ec;
+ ggml_barrier(state->shared);
+
+ if (state->shared->ec != GGML_STATUS_SUCCESS) {
break;
}
}
- return compute_status;
+
+ return 0;
}
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
- {
- GGML_ASSERT(cplan);
- GGML_ASSERT(cplan->n_threads > 0);
-
- if (cplan->work_size > 0) {
- GGML_ASSERT(cplan->work_data);
- }
- }
+ GGML_ASSERT(cplan);
+ GGML_ASSERT(cplan->n_threads > 0);
+ GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
int n_threads = cplan->n_threads;
-#if defined(GGML_USE_OPENMP)
- n_threads = MIN(n_threads, omp_get_max_threads());
-#endif
-
struct ggml_compute_state_shared state_shared = {
/*.cgraph =*/ cgraph,
/*.cplan =*/ cplan,
- /*.perf_node_start_cycles =*/ 0,
- /*.perf_node_start_time_us =*/ 0,
/*.n_threads =*/ n_threads,
/*.n_barrier =*/ 0,
/*.n_barrier_passed =*/ 0,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
- /*.current_chunk; =*/ 0,
+ /*.current_chunk =*/ 0,
+ /*.ec =*/ GGML_STATUS_SUCCESS,
};
+
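+ // with OpenMP the workers live on the stack of the parallel region; without it an explicit
+ // thread pool is created below and joined after the graph has been evaluated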
+#ifdef GGML_USE_OPENMP
+ if (n_threads > 1) {
+ #pragma omp parallel num_threads(n_threads)
+ {
+ #pragma omp single
+ {
+ // update the number of threads to the actual number of threads provided by OpenMP
+ n_threads = omp_get_num_threads();
+ state_shared.n_threads = n_threads;
+ }
+
+ struct ggml_compute_state worker = {
+ .thrd = 0,
+ .ith = omp_get_thread_num(),
+ .shared = &state_shared,
+ };
+ ggml_graph_compute_thread(&worker);
+ }
+ } else {
+ struct ggml_compute_state worker = {
+ .thrd = 0,
+ .ith = 0,
+ .shared = &state_shared,
+ };
+ ggml_graph_compute_thread(&worker);
+ }
+#else
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
- const int64_t perf_start_cycles = ggml_perf_cycles();
- const int64_t perf_start_time_us = ggml_perf_time_us();
for (int j = 0; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
- .ec = GGML_STATUS_SUCCESS,
};
}
- enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
-
- // performance stats (graph)
- {
- int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
- int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
+ // create thread pool
+ for (int j = 1; j < n_threads; ++j) {
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+ GGML_ASSERT(rc == 0);
+ UNUSED(rc);
+ }
- cgraph->perf_runs++;
- cgraph->perf_cycles += perf_cycles_cur;
- cgraph->perf_time_us += perf_time_us_cur;
+ // this is a work thread too
+ ggml_graph_compute_thread(&workers[0]);
- GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
- __func__, cgraph->perf_runs,
- (double) perf_cycles_cur / (double) ggml_cycles_per_ms(),
- (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
- (double) perf_time_us_cur / 1000.0,
- (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
+ // join or kill thread pool
+ if (n_threads > 1) {
+ for (int j = 1; j < n_threads; j++) {
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
+ GGML_ASSERT(rc == 0);
+ UNUSED(rc);
+ }
}
+#endif
+
+ // don't leave affinity set on the main thread
+ clear_numa_thread_affinity();
- return compute_status;
+ return state_shared.ec;
}
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
}
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
- int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
-
GGML_PRINT("=== GRAPH ===\n");
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
- perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
-
- GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
+ GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
i,
node->ne[0], node->ne[1], node->ne[2],
- ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
- (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
- (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
- (double) node->perf_time_us / 1000.0,
- (double) node->perf_time_us / 1000.0 / node->perf_runs);
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
}
GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
ggml_get_name(node));
}
- for (int i = 0; i < GGML_OP_COUNT; i++) {
- if (perf_total_per_op_us[i] == 0) {
- continue;
- }
-
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
- }
-
GGML_PRINT("========================================\n");
}