// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
#ifdef __cplusplus
}
#endif
#endif
}
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+ atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+ return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
#if defined(__gnu_linux__)
static cpu_set_t ggml_get_numa_affinity(void) {
cpu_set_t cpuset;
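
The two new helpers are thin wrappers over the threadpool's shared current_chunk counter (assumed here to be an atomic integer field of struct ggml_threadpool, since the implementation stores to it with C11 atomics). Relaxed ordering is enough because fetch-add hands each caller a distinct previous value, and the surrounding ggml_barrier calls order the phases. A standalone sketch of those semantics, with an illustrative counter and values that are not part of the patch:

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void) {
        atomic_int chunk;
        // like ggml_threadpool_chunk_set(tp, nth) with nth == 4
        atomic_store_explicit(&chunk, 4, memory_order_relaxed);
        // like ggml_threadpool_chunk_add(tp, 1): returns the old value, so
        // concurrent callers each observe a distinct chunk index (4, then 5, ...)
        int a = atomic_fetch_add_explicit(&chunk, 1, memory_order_relaxed);
        int b = atomic_fetch_add_explicit(&chunk, 1, memory_order_relaxed);
        printf("claimed chunks %d and %d, next free chunk is %d\n",
               a, b, atomic_load_explicit(&chunk, memory_order_relaxed));
        return 0;
    }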
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
-#include <atomic>
#include <array>
#include <type_traits>
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
- static std::atomic<int64_t> current_chunk;
-
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
- std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+ ggml_threadpool_chunk_set(params->threadpool, params->nth);
}
ggml_barrier(params->threadpool);
GGML_ASSERT(jj == jj2);
}
- // next step.
- job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+ job = ggml_threadpool_chunk_add(params->threadpool, 1);
}
ggml_barrier(params->threadpool);
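
Putting the pieces together, the scheduling pattern in gemm becomes: thread 0 seeds the counter to nth (each thread i handles chunk i first), a barrier publishes it, then every thread claims further chunks with fetch-add until the tiles run out. A minimal sketch of that loop under the ggml_compute_params layout used above; example_kernel, n_chunks, and the commented-out process_chunk are hypothetical names, not part of the patch:

    // assumes the internal ggml-cpu headers that declare ggml_compute_params,
    // ggml_barrier, and the two new chunk helpers
    static void example_kernel(const struct ggml_compute_params * params, int n_chunks) {
        if (params->ith == 0) {
            // every thread starts at chunk ith, so the first unclaimed chunk is nth
            ggml_threadpool_chunk_set(params->threadpool, params->nth);
        }
        ggml_barrier(params->threadpool);

        for (int job = params->ith; job < n_chunks;
             job = ggml_threadpool_chunk_add(params->threadpool, 1)) {
            // process_chunk(job);   // placeholder for the per-chunk tile work
        }
        ggml_barrier(params->threadpool);
    }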