llama : add thread safety test (llama/14035)

author Diego Devesa <redacted>

Mon, 16 Jun 2025 15:11:43 +0000 (08:11 -0700)

committer Georgi Gerganov <redacted>

Wed, 18 Jun 2025 07:21:15 +0000 (10:21 +0300)
author Diego Devesa <redacted>
Mon, 16 Jun 2025 15:11:43 +0000 (08:11 -0700)
committer Georgi Gerganov <redacted>
Wed, 18 Jun 2025 07:21:15 +0000 (10:21 +0300)
diff --git a/src/ggml-cpu/ggml-cpu-impl.h b/src/ggml-cpu/ggml-cpu-impl.h

index 9662e4d7b5a6a19f00e97905dd2d4417853760fc..ae68cd006336da6f0341ae14c938f87726e6c6ee 100644 (file)
--- a/src/ggml-cpu/ggml-cpu-impl.h
+++ b/src/ggml-cpu/ggml-cpu-impl.h
@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
  // TODO: move to ggml-threading
  void ggml_barrier(struct ggml_threadpool * tp);
  
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c

index ff28bf98bc7df39a0d9d52fa843fccc20a54df9a..2c12e493bc9b01aa103d6b5c179298101a46d1a8 100644 (file)
--- a/src/ggml-cpu/ggml-cpu.c
+++ b/src/ggml-cpu/ggml-cpu.c
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
  #endif
  }
  
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
  #if defined(__gnu_linux__)
  static cpu_set_t ggml_get_numa_affinity(void) {
      cpu_set_t cpuset;
diff --git a/src/ggml-cpu/llamafile/sgemm.cpp b/src/ggml-cpu/llamafile/sgemm.cpp

index 1d46158f928c4661ff68e345860bf490fd3c80a0..1c545f803327b9d62a033dd7b79b8f319b5d5193 100644 (file)
--- a/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/src/ggml-cpu/llamafile/sgemm.cpp
@@ -53,7 +53,6 @@
  #include "ggml-cpu-impl.h"
  #include "ggml-quants.h"
  
-#include <atomic>
  #include <array>
  #include <type_traits>
  
@@ -394,8 +393,6 @@ class tinyBLAS {
  
      template <int RM, int RN, int BM>
      NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
          GGML_ASSERT(m % (RM * BM) == 0);
          const int64_t ytiles = m / (RM * BM);
          const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
          if (params->ith == 0) {
              GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
              // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
          }
  
          ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
                  GGML_ASSERT(jj == jj2);
              }
  
-            // next step.
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
          }
  
          ggml_barrier(params->threadpool);
author	Diego Devesa <redacted>
	Mon, 16 Jun 2025 15:11:43 +0000 (08:11 -0700)
committer	Georgi Gerganov <redacted>
	Wed, 18 Jun 2025 07:21:15 +0000 (10:21 +0300)
src/ggml-cpu/ggml-cpu-impl.h		patch \| blob \| history
src/ggml-cpu/ggml-cpu.c		patch \| blob \| history
src/ggml-cpu/llamafile/sgemm.cpp		patch \| blob \| history