};
struct ggml_numa_nodes { // NUMA topology and thread-placement settings kept in g_state.numa
+ enum ggml_numa_strategy numa_strategy; // placement strategy passed to ggml_numa_init()
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; // per-node entries, at most GGML_NUMA_MAX_NODES
uint32_t n_nodes; // number of nodes found; 0 means NUMA is not initialized or not usable
uint32_t total_cpus; // hardware threads on system
+ uint32_t current_node; // node on which the main process is executing (filled in by getcpu() in ggml_numa_init())
+#ifdef __linux__
+ cpu_set_t cpuset; // affinity mask of the initializing thread, e.g. as imposed by numactl; reused by the NUMACTL strategy
+#else
+ uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
+#endif
};
//
atomic_fetch_sub(&g_state_barrier, 1);
}
-void ggml_numa_init(void) {
+#ifdef __linux__
+static cpu_set_t ggml_get_numa_affinity(void) { // returns the calling thread's CPU affinity mask by value
+ cpu_set_t cpuset;
+ pthread_t thread;
+ thread = pthread_self();
+ CPU_ZERO(&cpuset); // start from an empty set
+ pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); // NOTE(review): return value is ignored; on failure the zeroed set is presumably what gets returned
+ return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) { // non-Linux stub with a placeholder return type
+ return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
if (g_state.numa.n_nodes > 0) {
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
char path[256];
int rv;
+ // set numa scheme
+ g_state.numa.numa_strategy = numa_flag;
+
+ GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
+
+ g_state.numa.cpuset = ggml_get_numa_affinity();
+
// enumerate nodes
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
- if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+ // figure out which node we're on
+ uint current_cpu;
+ int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
g_state.numa.n_nodes = 0;
return;
}
+ GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
+
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
struct ggml_numa_node * node = &g_state.numa.nodes[n];
GGML_PRINT_DEBUG("CPUs on node %u:", n);
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__linux__) && !defined(__BIONIC__)
-static void set_numa_thread_affinity(int thread_n, int n_threads) {
+static void set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
}
- // run thread on node_num thread_n / (threads per node)
- const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
- struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+ int node_num;
+ int rv;
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+ switch(g_state.numa.numa_strategy) {
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
+ // spread threads round-robin across nodes: node_num = thread_n % n_nodes
+ node_num = thread_n % g_state.numa.n_nodes;
+ break;
+ case GGML_NUMA_STRATEGY_ISOLATE:
+ // run thread on current_node
+ node_num = g_state.numa.current_node;
+ break;
+ case GGML_NUMA_STRATEGY_NUMACTL:
+ // use the cpuset that numactl gave us
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+ if (rv) {
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
+ }
+ return;
+ default:
+ return;
+ }
+
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (size_t i = 0; i < node->n_cpus; ++i) {
CPU_SET_S(node->cpus[i], setsize, cpus);
}
- int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
- strerror(rv));
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
}
CPU_FREE(cpus);
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
- strerror(rv));
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
}
CPU_FREE(cpus);
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } // no-op stub used when the Linux (non-bionic) implementation is compiled out
static void clear_numa_thread_affinity(void) {} // no-op stub used when the Linux (non-bionic) implementation is compiled out
#endif
const int n_threads = state->shared->n_threads;
- set_numa_thread_affinity(state->ith, n_threads);
+ set_numa_thread_affinity(state->ith);
int node_n = -1;
int task_phase = GGML_TASK_FINALIZE;