]> git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid...
authorMax Krasnyansky <redacted>
Sat, 31 May 2025 22:39:19 +0000 (15:39 -0700)
committerGeorgi Gerganov <redacted>
Sun, 1 Jun 2025 12:14:44 +0000 (15:14 +0300)
* threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling

We talked about adding LOW priority for GGML threads in the original threadpool PR.
It might be useful for some cases to avoid contention.

Latest Windows ARM64 releases started parking (offlining) the CPU cores
more aggresively which results in suboptimal performance with n_threads > 4.
To deal with that we now disable Power Throttling for our threads for the NORMAL
and higher priorities.

Co-authored-by: Diego Devesa <redacted>
* threading: disable SetThreadInfo() calls for older Windows versions

* Update tools/llama-bench/llama-bench.cpp

Co-authored-by: Diego Devesa <redacted>
---------

Co-authored-by: Diego Devesa <redacted>
ggml/include/ggml.h
ggml/src/ggml-cpu/ggml-cpu.c

index 1cd03e82b61d17f57bd2583196ab04a7a92c0185..1a57f1cd75a31eedb2550c87bedca0d4c175e9dc 100644 (file)
@@ -2178,6 +2178,7 @@ extern "C" {
 
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
         GGML_SCHED_PRIO_HIGH,
index 1dc425fef1419c8da4fa60597dacd67827b45862..c7426df2b851b0bcb2d2cc24320e015c08d4394d 100644 (file)
@@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     // This is up to the applications.
     DWORD p = THREAD_PRIORITY_NORMAL;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = THREAD_PRIORITY_BELOW_NORMAL;  break;
         case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
         case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
         case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
         case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
     }
 
+    if (prio != GGML_SCHED_PRIO_LOW) {
+        // Tell Windows that this thread should not be throttled (needs its own CPU core).
+        // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
+        // all our threads onto the first 4 cores which results in terrible performance with
+        // n_threads > 4
+        #if _WIN32_WINNT >= 0x0602
+        THREAD_POWER_THROTTLING_STATE t;
+        ZeroMemory(&t, sizeof(t));
+        t.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+        t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
+        t.StateMask   = 0;
+
+        if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
+            GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
+            return false;
+        }
+        #endif
+    }
+
     if (prio == GGML_SCHED_PRIO_NORMAL) {
         // Keep inherited policy/priority
         return true;
@@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        // TODO: there seems to be no way to set lower prio on Apple platforms
+        case GGML_SCHED_PRIO_LOW:      policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
         case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      policy = SCHED_BATCH; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
         case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;