ggml : add numa options (llama/5377)

author bmwl <redacted>

Fri, 16 Feb 2024 09:31:07 +0000 (01:31 -0800)

committer Georgi Gerganov <redacted>

Mon, 19 Feb 2024 13:53:23 +0000 (15:53 +0200)
author bmwl <redacted>
Fri, 16 Feb 2024 09:31:07 +0000 (01:31 -0800)
committer Georgi Gerganov <redacted>
Mon, 19 Feb 2024 13:53:23 +0000 (15:53 +0200)
diff --git a/ggml.c b/ggml.c

index d921d82fed7d3358ef08d85042fd08a60770f915..4e302fb7de2f48c2e4d5db594ec492eb3e671f21 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -1954,9 +1954,16 @@ struct ggml_numa_node {
  };
  
  struct ggml_numa_nodes {
+    enum ggml_numa_strategy numa_strategy; // strategy passed to ggml_numa_init()
      struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
      uint32_t n_nodes;
      uint32_t total_cpus; // hardware threads on system
+    uint32_t current_node; // node on which the thread that called ggml_numa_init() was executing (from getcpu)
+#ifdef __linux__
+    cpu_set_t cpuset; // affinity mask of the thread that called ggml_numa_init(), e.g. as set by numactl
+#else
+    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
+#endif
  };
  
  //
@@ -1990,7 +1997,22 @@ inline static void ggml_critical_section_end(void) {
      atomic_fetch_sub(&g_state_barrier, 1);
  }
  
-void ggml_numa_init(void) {
+#ifdef __linux__
+static cpu_set_t ggml_get_numa_affinity(void) { // returns the CPU affinity mask of the calling thread
+    cpu_set_t cpuset;
+    pthread_t thread;
+    thread = pthread_self();
+    CPU_ZERO(&cpuset); // start from an empty mask
+    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); // NOTE: return value is not checked
+    return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) { // non-Linux stub with the same name; the result is a plain integer
+    return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
      if (g_state.numa.n_nodes > 0) {
          fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
  
@@ -2002,6 +2024,13 @@ void ggml_numa_init(void) {
      char path[256];
      int rv;
  
+    // set numa scheme
+    g_state.numa.numa_strategy = numa_flag;
+
+    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
+
+    g_state.numa.cpuset = ggml_get_numa_affinity();
+
      // enumerate nodes
      while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
          rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -2020,11 +2049,17 @@ void ggml_numa_init(void) {
  
      GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
  
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+    // figure out which node we're on
+    uint current_cpu;
+    int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
          g_state.numa.n_nodes = 0;
          return;
      }
  
+    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
+
      for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
          struct ggml_numa_node * node = &g_state.numa.nodes[n];
          GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -16638,26 +16673,46 @@ typedef pthread_t ggml_thread_t;
  
  // Android's libc implementation "bionic" does not support setting affinity
  #if defined(__linux__) && !defined(__BIONIC__)
-static void set_numa_thread_affinity(int thread_n, int n_threads) {
+static void set_numa_thread_affinity(int thread_n) {
      if (!ggml_is_numa()) {
          return;
      }
  
-    // run thread on node_num thread_n / (threads per node)
-    const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
-    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+    int node_num;
+    int rv;
      size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
  
+    switch(g_state.numa.numa_strategy) {
+        case GGML_NUMA_STRATEGY_DISTRIBUTE:
+            // run thread on node_num thread_n % (number of nodes), i.e. round-robin across nodes
+            node_num = thread_n % g_state.numa.n_nodes;
+            break;
+        case GGML_NUMA_STRATEGY_ISOLATE:
+            // run thread on current_node
+            node_num = g_state.numa.current_node;
+            break;
+        case GGML_NUMA_STRATEGY_NUMACTL:
+            // use the cpuset that numactl gave us
+            rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+            if (rv) {
+                fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
+            }
+            return;
+        default:
+            return;
+    }
+
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
      cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
      CPU_ZERO_S(setsize, cpus);
      for (size_t i = 0; i < node->n_cpus; ++i) {
          CPU_SET_S(node->cpus[i], setsize, cpus);
      }
  
-    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
      if (rv) {
-            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
-                    strerror(rv));
+            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
      }
  
      CPU_FREE(cpus);
@@ -16678,8 +16733,7 @@ static void clear_numa_thread_affinity(void) {
  
      int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
      if (rv) {
-        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
-            strerror(rv));
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
      }
  
      CPU_FREE(cpus);
@@ -16687,7 +16741,7 @@ static void clear_numa_thread_affinity(void) {
  #else
  // TODO: Windows etc.
  // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads);  }
+static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n);  }
  static void clear_numa_thread_affinity(void) {}
  #endif
  
@@ -16987,7 +17041,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  
      const int   n_threads   = state->shared->n_threads;
  
-    set_numa_thread_affinity(state->ith, n_threads);
+    set_numa_thread_affinity(state->ith);
  
      int node_n     = -1;
      int task_phase = GGML_TASK_FINALIZE;
diff --git a/ggml.h b/ggml.h

index 01cecc1e1845ffceafdfed44a2b5de78d78ca41b..270018185f397c664cd9f5045b26198a405a9a63 100644 (file)
--- a/ggml.h
+++ b/ggml.h
@@ -658,6 +658,16 @@ extern "C" {
          void * wdata;
      };
  
+    // numa strategies: selects how set_numa_thread_affinity() in ggml.c pins compute threads
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0, // no case in the affinity switch: thread affinity is left unchanged
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1, // thread i is pinned to the CPUs of node (i % n_nodes)
+        GGML_NUMA_STRATEGY_ISOLATE    = 2, // every thread is pinned to the node ggml_numa_init() was called on
+        GGML_NUMA_STRATEGY_NUMACTL    = 3, // threads reuse the affinity mask captured in ggml_numa_init()
+        GGML_NUMA_STRATEGY_MIRROR     = 4, // no case in the affinity switch: thread affinity is left unchanged
+        GGML_NUMA_STRATEGY_COUNT           // presumably a sentinel equal to the number of strategies -- confirm
+    };
+
      // misc
  
      GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
@@ -668,7 +678,7 @@ extern "C" {
  
      GGML_API void    ggml_print_backtrace(void);
  
-    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
      GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
  
      GGML_API void    ggml_print_object (const struct ggml_object * obj);
author	bmwl <redacted>
	Fri, 16 Feb 2024 09:31:07 +0000 (01:31 -0800)
committer	Georgi Gerganov <redacted>
	Mon, 19 Feb 2024 13:53:23 +0000 (15:53 +0200)