return cpu_get_num_physical_cores();
}
+// Helper for setting process priority
+
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ return true;
+ }
+
+ DWORD p = NORMAL_PRIORITY_CLASS;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
+ case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+ case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
+ case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
+ }
+
+ if (!SetPriorityClass(GetCurrentProcess(), p)) {
+ fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+ return false;
+ }
+
+ return true;
+}
+
+#else // macOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ return true;
+ }
+
+ int p = 0;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: p = 0; break;
+ case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
+ case GGML_SCHED_PRIO_HIGH: p = -10; break;
+ case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+ }
+
+ if (setpriority(PRIO_PROCESS, 0, p) != 0) {
+ fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+ return false;
+ }
+ return true;
+}
+
+#endif
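+
+// Example (sketch): callers apply this once at startup, e.g.
+//   set_process_priority(params.cpuparams.priority);
+// before creating threadpools; on Windows the per-thread priorities set later by the
+// threadpool are relative to this process priority class.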
+
//
// CLI argument parsing
//
}
}
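+// Resolve derived CPU params: if n_threads is unset (< 0), inherit from the "role model"
+// params when given, otherwise fall back to cpu_get_num_math(); then warn if the affinity
+// mask has fewer set bits than the requested thread count.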
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+ int32_t n_set = 0;
+
+ if (cpuparams.n_threads < 0) {
+ // Assuming everything about cpuparams is invalid
+ if (role_model != nullptr) {
+ cpuparams = *role_model;
+ } else {
+ cpuparams.n_threads = cpu_get_num_math();
+ }
+ }
+
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+ if (cpuparams.cpumask[i]) {
+ n_set++;
+ }
+ }
+
+ if (n_set && n_set < cpuparams.n_threads) {
+ // Not enough set bits, may experience performance issues.
+ fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+ }
+}
+
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
}
}
+ postprocess_cpu_params(params.cpuparams, nullptr);
+ postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
- get_env("LLAMA_ARG_THREADS", params.n_threads);
+ get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
return true;
}
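+// Example (sketch): "0-3" marks CPUs 0..3, "4-" marks CPUs 4..GGML_MAX_N_THREADS-1,
+// and "-7" marks CPUs 0..7.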
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+ size_t dash_loc = range.find('-');
+ if (dash_loc == std::string::npos) {
+ fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+ return false;
+ }
+
+ size_t start_i;
+ size_t end_i;
+
+ if (dash_loc == 0) {
+ start_i = 0;
+ } else {
+ start_i = std::stoull(range.substr(0, dash_loc));
+ if (start_i >= GGML_MAX_N_THREADS) {
+ fprintf(stderr, "Start index out of bounds!\n");
+ return false;
+ }
+ }
+
+ if (dash_loc == range.length() - 1) {
+ end_i = GGML_MAX_N_THREADS - 1;
+ } else {
+ end_i = std::stoull(range.substr(dash_loc + 1));
+ if (end_i >= GGML_MAX_N_THREADS) {
+ fprintf(stderr, "End index out of bounds!\n");
+ return false;
+ }
+ }
+
+ for (size_t i = start_i; i <= end_i; i++) {
+ boolmask[i] = true;
+ }
+
+ return true;
+}
+
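+// Example (sketch): "0x5" (or just "5") sets CPUs 0 and 2; "0xff" sets CPUs 0..7.
+// The least-significant hex digit maps to CPUs 0..3, the next one to 4..7, and so on.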
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+ // Discard potential 0x prefix
+ size_t start_i = 0;
+ if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
+ start_i = 2;
+ }
+
+ size_t num_digits = mask.length() - start_i;
+ if (num_digits > 128) num_digits = 128;
+
+ size_t end_i = num_digits + start_i;
+
+ for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+ char c = mask.at(i);
+ int8_t id = c;
+
+ if ((c >= '0' && c <= '9')) {
+ id -= '0';
+ } else if (c >= 'a' && c <= 'f') {
+ id -= 'a' - 10;
+ } else if (c >= 'A' && c <= 'F') {
+ id -= 'A' - 10;
+ } else {
+ fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+ return false;
+ }
+
+ boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
+ boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+ boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+ boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
+ }
+
+ return true;
+}
+
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
}
if (arg == "-t" || arg == "--threads") {
CHECK_ARG
- params.n_threads = std::stoi(argv[i]);
- if (params.n_threads <= 0) {
- params.n_threads = std::thread::hardware_concurrency();
+ params.cpuparams.n_threads = std::stoi(argv[i]);
+ if (params.cpuparams.n_threads <= 0) {
+ params.cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
+ if (arg == "-C" || arg == "--cpu-mask") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "-Cr" || arg == "--cpu-range") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "--prio") {
+ CHECK_ARG
+ params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict") {
+ CHECK_ARG
+ params.cpuparams.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll") {
+ CHECK_ARG
+ params.cpuparams.poll = std::stoul(argv[i]);
+ return true;
+ }
if (arg == "-tb" || arg == "--threads-batch") {
CHECK_ARG
- params.n_threads_batch = std::stoi(argv[i]);
- if (params.n_threads_batch <= 0) {
- params.n_threads_batch = std::thread::hardware_concurrency();
+ params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+ if (params.cpuparams_batch.n_threads <= 0) {
+ params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
+ if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "-Crb" || arg == "--cpu-range-batch") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "--prio-batch") {
+ CHECK_ARG
+ params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict-batch") {
+ CHECK_ARG
+ params.cpuparams_batch.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll-batch") {
+ CHECK_ARG
+ params.cpuparams_batch.poll = std::stoul(argv[i]);
+ return true;
+ }
if (arg == "-td" || arg == "--threads-draft") {
CHECK_ARG
- params.n_threads_draft = std::stoi(argv[i]);
- if (params.n_threads_draft <= 0) {
- params.n_threads_draft = std::thread::hardware_concurrency();
+ params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+ if (params.draft_cpuparams.n_threads <= 0) {
+ params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
+ }
+ if (arg == "-Cd" || arg == "--cpu-mask-draft") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.draft_cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "-Crd" || arg == "--cpu-range-draft") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.draft_cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "--prio-draft") {
+ CHECK_ARG
+ params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict-draft") {
+ CHECK_ARG
+ params.draft_cpuparams.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll-draft") {
+ CHECK_ARG
+ params.draft_cpuparams.poll = std::stoul(argv[i]);
+ return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
CHECK_ARG
- params.n_threads_batch_draft = std::stoi(argv[i]);
- if (params.n_threads_batch_draft <= 0) {
- params.n_threads_batch_draft = std::thread::hardware_concurrency();
+ params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+ if (params.draft_cpuparams_batch.n_threads <= 0) {
+ params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
+ if (arg == "-Cbd" || arg == "--cpu-mask-batch-draft") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.draft_cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.draft_cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "--prio-batch-draft") {
+ CHECK_ARG
+ params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict-batch-draft") {
+ CHECK_ARG
+ params.draft_cpuparams_batch.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll-batch-draft") {
+ CHECK_ARG
+ params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
+ return true;
+ }
if (arg == "-p" || arg == "--prompt") {
CHECK_ARG
params.prompt = argv[i];
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
- options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
- options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
- "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+
+#ifndef GGML_USE_OPENMP
+ // these options are available only with the internal threadpool
+ options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+ options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
+ options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+ options.push_back({ "*", " --prio N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+ options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
+
+ options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+ options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+ options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
+ options.push_back({ "*", " --prio-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+ options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll)"});
+
+ options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+ options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+ options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+ options.push_back({ "speculative", " --prio-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+ options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll)"});
+
+ options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask-draft)"});
+ options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+ "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft"});
+ options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
+ "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+ options.push_back({ "speculative", " --prio-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio-draft)"});
+ options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: same as --poll-draft)"});
+#endif // GGML_USE_OPENMP
+
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
- options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
- os << "system_info: n_threads = " << params.n_threads;
- if (params.n_threads_batch != -1) {
- os << " (n_threads_batch = " << params.n_threads_batch << ")";
+ os << "system_info: n_threads = " << params.cpuparams.n_threads;
+ if (params.cpuparams_batch.n_threads != -1) {
+ os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
- cparams.n_threads = params.n_threads;
- cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ cparams.n_threads = params.cpuparams.n_threads;
+ cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
return cparams;
}
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+ struct ggml_threadpool_params tpp;
+
+ ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+ if (params.mask_valid) {
+ std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
+ }
+
+ tpp.prio = params.priority;
+ tpp.poll = params.poll;
+ tpp.strict_cpu = params.strict_cpu;
+
+ return tpp;
+}
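+
+// Example (sketch): the examples combine this with ggml_threadpool_new() and
+// llama_attach_threadpool(), e.g.
+//   struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);
+//   struct ggml_threadpool * tp       = ggml_threadpool_new(&tpp);
+//   llama_attach_threadpool(ctx, tp, /* threadpool_batch */ NULL);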
+
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
- fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+ fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
DIMRE_METHOD_MEAN,
};
+struct cpu_params {
+ int n_threads = -1;
+ bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+ bool mask_valid = false; // Default: any CPU
+ enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+ bool strict_cpu = false; // Use strict CPU placement
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
- int32_t n_threads = cpu_get_num_math();
- int32_t n_threads_draft = -1;
- int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
- int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ struct cpu_params cpuparams;
+ struct cpu_params cpuparams_batch;
+ struct cpu_params draft_cpuparams;
+ struct cpu_params draft_cpuparams_batch;
+
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
- int32_t n_threads_http = -1; // number of threads to process HTTP requests
+ int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string gpt_params_get_system_info(const gpt_params & params);
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
//
// String utils
//
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
- int32_t n_threads = 1;
+ int n_threads = 1;
int32_t n_iterations = 10;
};
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
- pca_params.n_threads = params.n_threads;
- pca_params.n_batch = params.n_pca_batch;
+ pca_params.n_threads = params.cpuparams.n_threads;
+ pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
g_verbose = (params.verbosity == 1);
try {
- lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+ lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
#include <sstream>
#include <string>
#include <vector>
+#include <thread>
#include "ggml.h"
#include "llama.h"
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
+ std::vector<std::string> cpu_mask;
+ std::vector<bool> cpu_strict;
+ std::vector<int> poll;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
+ ggml_sched_priority prio;
+ int delay;
bool verbose;
output_formats output_format;
output_formats output_format_stderr;
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
+ /* cpu_mask */ {"0x0"},
+ /* cpu_strict */ {false},
+ /* poll */ {50},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
+ /* prio */ GGML_SCHED_PRIO_NORMAL,
+ /* delay */ 0,
/* verbose */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+ printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+ printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+ printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+ printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
+ printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa;
+ params.prio = cmd_params_defaults.prio;
+ params.delay = cmd_params_defaults.delay;
for (int i = 1; i < argc; i++) {
arg = argv[i];
}
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+ } else if (arg == "-C" || arg == "--cpu-mask") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = string_split<std::string>(argv[i], split_delim);
+ params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+ } else if (arg == "--cpu-strict") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = string_split<bool>(argv[i], split_delim);
+ params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+ } else if (arg == "--poll") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.reps = std::stoi(argv[i]);
+ } else if (arg == "--prio") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+ } else if (arg == "--delay") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
+ if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
+ if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+ if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
return params;
}
ggml_type type_k;
ggml_type type_v;
int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
- for (const auto & nt : params.n_threads) {
+ for (const auto & nt : params.n_threads)
+ for (const auto & cm : params.cpu_mask)
+ for (const auto & cs : params.cpu_strict)
+ for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
+ /* .cpu_mask = */ cm,
+ /* .cpu_strict = */ cs,
+ /* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
+ /* .cpu_mask = */ cm,
+ /* .cpu_strict = */ cs,
+ /* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
+ /* .cpu_mask = */ cm,
+ /* .cpu_strict = */ cs,
+ /* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
int n_batch;
int n_ubatch;
int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
+ cpu_mask = inst.cpu_mask;
+ cpu_strict = inst.cpu_strict;
+ poll = inst.poll;
has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k;
type_v = inst.type_v;
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
- "n_threads", "type_k", "type_v",
+ "n_threads", "cpu_mask", "cpu_strict", "poll",
+ "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
- "avg_ts", "stddev_ts"
+ "avg_ts", "stddev_ts",
};
return fields;
}
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
- field == "n_threads" ||
+ field == "n_threads" || field == "poll" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+ field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
}
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
- std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+ std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+ ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
return -30;
}
if (field == "t/s") {
- return 16;
+ return 20;
}
if (field == "size" || field == "params") {
return 10;
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.emplace_back("n_threads");
}
+ if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+ fields.emplace_back("cpu_mask");
+ }
+ if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+ fields.emplace_back("cpu_strict");
+ }
+ if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+ fields.emplace_back("poll");
+ }
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch");
}
llama_backend_init();
llama_numa_init(params.numa);
+ set_process_priority(params.prio);
+
// initialize printer
std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
llama_kv_cache_clear(ctx);
+ // cool off before the test
+ if (params.delay) {
+ std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+ }
+
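+ // Each test configuration gets its own threadpool, built from the per-test
+ // cpu-mask / cpu-strict / poll values and the global --prio setting.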
+ struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+ if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+ LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+ exit(1);
+ }
+ tpp.strict_cpu = t.cpu_strict;
+ tpp.poll = t.poll;
+ tpp.prio = params.prio;
+
+ struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ exit(1);
+ }
+
+ llama_attach_threadpool(ctx, threadpool, NULL);
+
// warmup run
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
llama_print_timings(ctx);
llama_free(ctx);
+
+ ggml_threadpool_free(threadpool);
}
llama_free_model(lmodel);
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
- ctx_params.n_threads = UInt32(n_threads)
- ctx_params.n_threads_batch = UInt32(n_threads)
+ ctx_params.n_threads = Int32(n_threads)
+ ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
- embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+ embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
- embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+ embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
- auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+ auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
return 1;
}
+ LOG("%s: llama threadpool init, n_threads = %d\n",
+ __func__,
+ (int) params.cpuparams.n_threads
+ );
+ struct ggml_threadpool_params tpp_batch =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+ struct ggml_threadpool_params tpp =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+ set_process_priority(params.cpuparams.priority);
+
+ struct ggml_threadpool * threadpool_batch = NULL;
+ if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+ threadpool_batch = ggml_threadpool_new(&tpp_batch);
+ if (!threadpool_batch) {
+ LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+ exit(1);
+ }
+
+ // Start the non-batch threadpool in the paused state
+ tpp.paused = true;
+ }
+
+ struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ exit(1);
+ }
+
+ llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+ if (ctx_guidance) {
+ llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
+ }
+
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
llama_sampling_free(ctx_sampling);
llama_backend_free();
+ ggml_threadpool_free(threadpool);
+ ggml_threadpool_free(threadpool_batch);
+
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
});
LOG_INFO("system info", {
- {"n_threads", params.n_threads},
- {"n_threads_batch", params.n_threads_batch},
+ {"n_threads", params.cpuparams.n_threads},
+ {"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
- if (params.n_threads_draft > 0) {
- params.n_threads = params.n_threads_draft;
+ if (params.draft_cpuparams.n_threads > 0) {
+ params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
}
- params.n_threads_batch = params.n_threads_batch_draft;
+
+ params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context;
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator
struct ggml_tallocr {
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+ GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
#define GGML_MAX_SRC 10
+#define GGML_MAX_N_THREADS 512
+
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
+ // Scheduling priorities
+ enum ggml_sched_priority {
+ GGML_SCHED_PRIO_NORMAL,
+ GGML_SCHED_PRIO_MEDIUM,
+ GGML_SCHED_PRIO_HIGH,
+ GGML_SCHED_PRIO_REALTIME
+ };
+
+ // Threadpool params
+ // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+ struct ggml_threadpool_params {
+ bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+ int n_threads; // number of threads
+ enum ggml_sched_priority prio; // thread priority
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
+ bool strict_cpu; // strict cpu placement
+ bool paused; // start in paused state
+ };
+
+ struct ggml_threadpool; // forward declaration, see ggml.c
+
+ typedef struct ggml_threadpool * ggml_threadpool_t;
+
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
+ struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
+ GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
+ GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+ GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+ GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+ GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
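+
+ // Minimal usage sketch (assuming a built cgraph and a caller-managed work buffer):
+ //   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4);
+ //   struct ggml_threadpool *       tp = ggml_threadpool_new(&tpp);
+ //   struct ggml_cplan            plan = ggml_graph_plan(cgraph, 4, tp);
+ //   // ... allocate plan.work_data of size plan.work_size, then ...
+ //   ggml_graph_compute(cgraph, &plan);
+ //   ggml_threadpool_free(tp);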
+
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
- GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API struct ggml_cplan ggml_graph_plan(
+ const struct ggml_cgraph * cgraph,
+ int n_threads, /* = GGML_DEFAULT_N_THREADS */
+ struct ggml_threadpool * threadpool /* = NULL */ );
+ GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
add_compile_definitions(_GNU_SOURCE)
endif()
#endif
struct ggml_backend_cpu_context {
- int n_threads;
- void * work_data;
- size_t work_size;
+ int n_threads;
+ ggml_threadpool_t threadpool;
+
+ void * work_data;
+ size_t work_size;
ggml_abort_callback abort_callback;
void * abort_callback_data;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data);
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
+ ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->n_threads = n_threads;
}
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
+ // already had a different threadpool, pause/suspend it before switching
+ ggml_threadpool_pause(ctx->threadpool);
+ }
+ ctx->threadpool = threadpool;
+}
+
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
#endif
#include <windows.h>
+#if !defined(__clang__)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
typedef atomic_int atomic_flag;
#define ATOMIC_FLAG_INIT 0
+typedef enum {
+ memory_order_relaxed,
+ memory_order_consume,
+ memory_order_acquire,
+ memory_order_release,
+ memory_order_acq_rel,
+ memory_order_seq_cst
+} memory_order;
+
static void atomic_store(atomic_int * ptr, LONG val) {
InterlockedExchange(ptr, val);
}
+static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
+ // TODO: add support for explicit memory order
+ InterlockedExchange(ptr, val);
+}
static LONG atomic_load(atomic_int * ptr) {
return InterlockedCompareExchange(ptr, 0, 0);
}
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+ // TODO: add support for explicit memory order
+ return InterlockedCompareExchange(ptr, 0, 0);
+}
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
return InterlockedExchangeAdd(ptr, inc);
}
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
- return atomic_fetch_add(ptr, -(dec));
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+ // TODO: add support for explicit memory order
+ return InterlockedExchangeAdd(ptr, inc);
}
static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
return InterlockedExchange(ptr, 1);
static void atomic_flag_clear(atomic_flag * ptr) {
InterlockedExchange(ptr, 0);
}
+#else // clang
+#include <stdatomic.h>
+#endif
typedef HANDLE pthread_t;
return 0;
}
#else
+
#include <pthread.h>
#include <stdatomic.h>
+#include <sched.h>
typedef void * thread_ret_t;
struct ggml_context context;
};
-struct ggml_compute_state_shared {
- const struct ggml_cgraph * cgraph;
- const struct ggml_cplan * cplan;
+//
+// Threading defs
+//
+
+typedef pthread_t ggml_thread_t;
+
+#if defined(_WIN32)
+
+typedef CONDITION_VARIABLE ggml_cond_t;
+typedef SRWLOCK ggml_mutex_t;
+
+#define ggml_mutex_init(m) InitializeSRWLock(m)
+#define ggml_mutex_destroy(m)
+#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m)
+#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
+#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m)
+#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
+
+#define ggml_cond_init(c) InitializeConditionVariable(c)
+#define ggml_cond_destroy(c)
+#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
+#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
+#else
- int n_threads;
+typedef pthread_cond_t ggml_cond_t;
+typedef pthread_mutex_t ggml_mutex_t;
+
+#define ggml_mutex_init(m) pthread_mutex_init(m, NULL)
+#define ggml_mutex_destroy(m) pthread_mutex_destroy(m)
+#define ggml_mutex_lock(m) pthread_mutex_lock(m)
+#define ggml_mutex_unlock(m) pthread_mutex_unlock(m)
+#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m)
+#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
+
+#define ggml_lock_init(x) UNUSED(x)
+#define ggml_lock_destroy(x) UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x) _mm_pause()
+#else
+#define ggml_lock_lock(x) UNUSED(x)
+#endif
+#define ggml_lock_unlock(x) UNUSED(x)
+
+#define GGML_LOCK_INITIALIZER 0
+#define ggml_cond_init(c) pthread_cond_init(c, NULL)
+#define ggml_cond_destroy(c) pthread_cond_destroy(c)
+#define ggml_cond_wait(c, m) pthread_cond_wait(c, m)
+#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
+#endif
+
+// Threadpool def
+struct ggml_threadpool {
+ ggml_mutex_t mutex; // mutex for cond.var
+ ggml_cond_t cond; // cond.var for waiting for new work
+
+ struct ggml_cgraph * cgraph;
+ struct ggml_cplan * cplan;
// synchronization primitives
+ atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
atomic_int n_barrier;
atomic_int n_barrier_passed;
+ atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
- ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
- void * abort_callback_data;
+ // these are atomic as an annotation for thread-sanitizer
+ atomic_bool stop; // Used for stopping the threadpool altogether
+ atomic_bool pause; // Used for pausing the threadpool or individual threads
- atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
+ struct ggml_compute_state * workers; // per thread state
+ int n_threads_max; // number of threads in the pool
+ int n_threads_cur; // number of threads used in the current graph
+
+ int32_t prio; // Scheduling priority
+ uint32_t poll; // Polling level (0 - no polling)
enum ggml_status ec;
};
+// Per-thread state
struct ggml_compute_state {
+#ifndef GGML_USE_OPENMP
ggml_thread_t thrd;
+ bool cpumask[GGML_MAX_N_THREADS];
+ int last_graph;
+ bool pending;
+#endif
+ struct ggml_threadpool * threadpool;
int ith;
- struct ggml_compute_state_shared * shared;
};
struct ggml_compute_params {
size_t wsize;
void * wdata;
- struct ggml_compute_state_shared * shared;
+ struct ggml_threadpool * threadpool;
};
//
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void ggml_thread_cpu_relax(void) {
+ __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void ggml_thread_cpu_relax(void) {
+ _mm_pause();
+}
+#else
+static inline void ggml_thread_cpu_relax(void) {;}
+#endif
+
//
// NUMA support
//
}
#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
- if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_threadpool * threadpool) {
+ if (threadpool->n_threads_cur == 1) {
return;
}
#pragma omp barrier
}
#else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
- if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_threadpool * threadpool) {
+ if (threadpool->n_threads_cur == 1) {
return;
}
- atomic_int * n_barrier = &shared->n_barrier;
- atomic_int * n_barrier_passed = &shared->n_barrier_passed;
+ atomic_int * n_barrier = &threadpool->n_barrier;
+ atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
- int n_threads = shared->n_threads;
- int passed_old = atomic_load(n_barrier_passed);
+ int n_threads = threadpool->n_threads_cur;
+ int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
// last thread
atomic_store(n_barrier, 0);
- atomic_fetch_add(n_barrier_passed, 1);
+ atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
} else {
// wait for other threads
- const int n_spin_before_sleep = 100000;
while (true) {
- for (int i = 0; i < n_spin_before_sleep; i++) {
- if (atomic_load(n_barrier_passed) != passed_old) {
- return;
- }
- #if defined(__SSE3__)
- _mm_pause();
- #endif
+ if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
+ return;
}
- sched_yield();
+ ggml_thread_cpu_relax();
}
}
}
((char *) src0->data),
ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
const int ith = params->ith;
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
- atomic_store(¶ms->shared->current_chunk, nth);
+ atomic_store_explicit(¶ms->threadpool->current_chunk, nth, memory_order_relaxed);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
#if GGML_USE_LLAMAFILE
if (src1->type != vec_dot_type) {
break;
}
- current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1);
+ current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed);
}
}
}
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
if (ith == 0) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// dst[:,:,:,:] = 0
// for i2,i3:
if (ith == 0) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// parallelize by last three dimensions
((char *) src0->data),
ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
const int ith = params->ith;
((char *) src0->data),
ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
// TODO: handle transposed/permuted matrices
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
memset(dst->data, 0, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int32_t stride = ggml_get_op_params_i32(dst, 0);
if (ith == 0) {
memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int64_t elem_q = ggml_nelements(q);
const int64_t elem_k = ggml_nelements(k);
if (params->ith == 0) {
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
if (ith == 0) {
memset(sums, 0, sizeof(float) * (nth + nth * nc));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// rows per thread
const int dr = (nr + nth - 1)/nth;
}
#endif
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
if (ith == 0) {
float * dp = (float *) dst->data;
ggml_hash_set_reset(&cgraph->visited_hash_set);
}
-//
-// thread data
-//
-// synchronization is done via busy loops
-// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
-//
-
-#ifdef __APPLE__
-
-//#include <os/lock.h>
-//
-//typedef os_unfair_lock ggml_lock_t;
-//
-//#define ggml_lock_init(x) UNUSED(x)
-//#define ggml_lock_destroy(x) UNUSED(x)
-//#define ggml_lock_lock os_unfair_lock_lock
-//#define ggml_lock_unlock os_unfair_lock_unlock
-//
-//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x) UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x) UNUSED(x)
-#define ggml_lock_unlock(x) UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join pthread_join
-
-#else
-
-//typedef pthread_spinlock_t ggml_lock_t;
-
-//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
-//#define ggml_lock_destroy pthread_spin_destroy
-//#define ggml_lock_lock pthread_spin_lock
-//#define ggml_lock_unlock pthread_spin_unlock
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x) UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x) _mm_pause()
-#else
-#define ggml_lock_lock(x) UNUSED(x)
-#endif
-#define ggml_lock_unlock(x) UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join pthread_join
-
-#endif
-
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
return n_tasks;
}
-struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
+
+#if defined(_WIN32)
+#include "windows.h"
+
+// TODO: support > 64 CPUs
+static bool ggml_thread_apply_affinity(const bool * mask) {
+ HANDLE h = GetCurrentThread();
+ uint64_t bitmask = 0ULL;
+
+ assert(GGML_MAX_N_THREADS >= 64);
+
+ for (int32_t i = 0; i < 8; i++) {
+ int32_t idx = i * 8;
+ uint8_t val = 0;
+ val |= mask[idx + 0] << 0;
+ val |= mask[idx + 1] << 1;
+ val |= mask[idx + 2] << 2;
+ val |= mask[idx + 3] << 3;
+ val |= mask[idx + 4] << 4;
+ val |= mask[idx + 5] << 5;
+ val |= mask[idx + 6] << 6;
+ val |= mask[idx + 7] << 7;
+ bitmask |= (uint64_t)val << idx;
+ }
+
+ for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
+ if (mask[i]) {
+ fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
+ break;
+ }
+ }
+
+ DWORD_PTR m = (DWORD_PTR)bitmask;
+
+ m = SetThreadAffinityMask(h, m);
+
+ return m != 0;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+ // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
+ // This is up to the applications.
+ DWORD p = THREAD_PRIORITY_NORMAL;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
+ case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
+ case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
+ case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+ }
+
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ // Keep inherited policy/priority
+ return true;
+ }
+
+ if (!SetThreadPriority(GetCurrentThread(), p)) {
+ fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
+ return false;
+ }
+
+ return true;
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+ // Not supported on Apple platforms
+ UNUSED(mask);
+ return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+ struct sched_param p;
+ int32_t policy = SCHED_OTHER;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
+ case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
+ case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
+ case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
+ }
+
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ // Keep inherited policy/priority
+ return true;
+ }
+
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+ if (err != 0) {
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+ return false;
+ }
+
+ return true;
+}
+
+#else // posix?
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+ cpu_set_t cpuset;
+ int err;
+
+ CPU_ZERO(&cpuset);
+
+ for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+ if (mask[i]) {
+ GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+ CPU_SET(i, &cpuset);
+ }
+ }
+
+#ifdef __ANDROID__
+ err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+ if (err < 0) {
+ err = errno;
+ }
+#else
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+ if (err != 0) {
+ fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
+ return false;
+ }
+
+ return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+ struct sched_param p;
+ int32_t policy = SCHED_OTHER;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
+ case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
+ case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
+ case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
+ }
+
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ // Keep inherited policy/priority
+ return true;
+ }
+
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+ if (err != 0) {
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+ return false;
+ }
+
+ return true;
+}
+
+#endif
+
+static bool ggml_thread_cpumask_is_valid(const bool * mask) {
+ for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
+ if (mask[i]) { return true; }
+ }
+ return false;
+}
+
+static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
+ if (!strict) {
+ memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
+ return;
+ } else {
+ memset(local_mask, 0, GGML_MAX_N_THREADS);
+ int32_t base_idx = *iter;
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+ int32_t idx = base_idx + i;
+ if (idx >= GGML_MAX_N_THREADS) {
+ // Just a cheaper modulo
+ idx -= GGML_MAX_N_THREADS;
+ }
+ if (global_mask[idx]) {
+ local_mask[idx] = 1;
+ *iter = idx + 1;
+ return;
+ }
+ }
+ }
+}
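+
+// Illustrative example (comment only): with a global mask that enables CPUs 1, 2 and 4,
+// strict placement hands out one CPU per worker in round-robin order, while non-strict
+// placement gives every worker the full mask:
+//
+//   bool global[GGML_MAX_N_THREADS] = { 0, 1, 1, 0, 1 }; // remaining entries are zero
+//   bool local [GGML_MAX_N_THREADS];
+//   int32_t iter = 0;
+//   ggml_thread_cpumask_next(global, local, true, &iter); // selects CPU 1, iter == 2
+//   ggml_thread_cpumask_next(global, local, true, &iter); // selects CPU 2, iter == 3
+//   ggml_thread_cpumask_next(global, local, true, &iter); // selects CPU 4, iter == 5
+//   ggml_thread_cpumask_next(global, local, true, &iter); // wraps around, selects CPU 1 again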
+
+void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
+ if (!threadpool) return;
+
+#ifndef GGML_USE_OPENMP
+ struct ggml_compute_state* workers = threadpool->workers;
+ const int n_threads = threadpool->n_threads_max;
+
+ ggml_mutex_lock(&threadpool->mutex);
+
+ threadpool->stop = true;
+ threadpool->pause = false;
+
+ ggml_cond_broadcast(&threadpool->cond);
+ ggml_mutex_unlock(&threadpool->mutex);
+
+ for (int j = 1; j < n_threads; j++) {
+ int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
+ GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
+ UNUSED(rc);
+ }
+
+ ggml_mutex_destroy(&threadpool->mutex);
+ ggml_cond_destroy(&threadpool->cond);
+#endif // GGML_USE_OPENMP
+
+ GGML_ALIGNED_FREE(threadpool->workers);
+ GGML_ALIGNED_FREE(threadpool);
+}
+
+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
+ GGML_PRINT_DEBUG("Pausing threadpool\n");
+ threadpool->pause = true;
+ ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
+ GGML_PRINT_DEBUG("Resuming threadpool\n");
+ threadpool->pause = false;
+ ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
+void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+ ggml_mutex_lock(&threadpool->mutex);
+ if (!threadpool->pause) {
+ ggml_threadpool_pause_locked(threadpool);
+ }
+ ggml_mutex_unlock(&threadpool->mutex);
+#else
+ UNUSED(threadpool);
+#endif
+}
+
+void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+ ggml_mutex_lock(&threadpool->mutex);
+ if (threadpool->pause) {
+ ggml_threadpool_resume_locked(threadpool);
+ }
+ ggml_mutex_unlock(&threadpool->mutex);
+#else
+ UNUSED(threadpool);
+#endif
+}
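+
+// Usage sketch (comment only): a persistent threadpool can be created in the paused
+// state so the workers sleep until the first graph is dispatched, and paused again
+// once the pool goes idle:
+//
+//   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
+//   tpp.paused = true;
+//   struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
+//   ... dispatching a graph resumes a paused pool automatically ...
+//   ggml_threadpool_pause(tp);   // park the workers while the pool is idle
+//   ggml_threadpool_free(tp);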
+
+struct ggml_cplan ggml_graph_plan(
+ const struct ggml_cgraph * cgraph,
+ int n_threads,
+ struct ggml_threadpool * threadpool) {
+
+ if (threadpool == NULL) {
+ GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+ }
if (n_threads <= 0) {
- n_threads = GGML_DEFAULT_N_THREADS;
+ n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
}
size_t work_size = 0;
}
if (work_size > 0) {
- work_size += CACHE_LINE_SIZE*(n_threads - 1);
+ work_size += CACHE_LINE_SIZE*(n_threads);
}
- cplan.n_threads = MIN(max_tasks, n_threads);
- cplan.work_size = work_size;
- cplan.work_data = NULL;
+ cplan.threadpool = threadpool;
+ cplan.n_threads = MIN(max_tasks, n_threads);
+ cplan.work_size = work_size;
+ cplan.work_data = NULL;
return cplan;
}
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
- const struct ggml_cgraph * cgraph = state->shared->cgraph;
- const struct ggml_cplan * cplan = state->shared->cplan;
+ const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
+ const struct ggml_cplan * cplan = state->threadpool->cplan;
set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
- /*.ith =*/ state->ith,
- /*.nth =*/ state->shared->n_threads,
- /*.wsize =*/ cplan->work_size,
- /*.wdata =*/ cplan->work_data,
- /*.shared=*/ state->shared,
+ /*.ith =*/ state->ith,
+ /*.nth =*/ state->threadpool->n_threads_cur,
+ /*.wsize =*/ cplan->work_size,
+ /*.wdata =*/ cplan->work_data,
+ /*.threadpool=*/ state->threadpool,
};
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
ggml_compute_forward(¶ms, node);
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
- state->shared->ec = GGML_STATUS_ABORTED;
+ state->threadpool->ec = GGML_STATUS_ABORTED;
}
- ggml_barrier(state->shared);
+ ggml_barrier(state->threadpool);
- if (state->shared->ec != GGML_STATUS_SUCCESS) {
+ if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
break;
}
}
return 0;
}
+#ifndef GGML_USE_OPENMP
+
+static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ if (state->pending || threadpool->stop || threadpool->pause) { return true; }
+
+ // check for new graph/work
+ int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+ if (new_graph != state->last_graph) {
+ state->pending = (state->ith < threadpool->n_threads_cur);
+ state->last_graph = new_graph;
+ }
+
+ return state->pending;
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ // This scaling makes 0 ... 100 a reasonable range for the polling level across modern processors.
+ // It could potentially be adjusted dynamically based on the observed load.
+ const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+ for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
+ // No new work. Keep polling.
+ ggml_thread_cpu_relax();
+ }
+
+ return state->pending;
+}
+
+static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ if (ggml_graph_compute_poll_for_work(state)) {
+ return state->pending;
+ }
+
+ ggml_mutex_lock_shared(&threadpool->mutex);
+ while (!ggml_graph_compute_ready(state)) {
+ // No new work. Wait for the signal.
+ GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+ }
+ ggml_mutex_unlock_shared(&threadpool->mutex);
+
+ return state->pending;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
+ struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(state->cpumask)) {
+ ggml_thread_apply_affinity(state->cpumask);
+ }
+
+ while (true) {
+ // Check if we need to sleep
+ while (threadpool->pause) {
+ GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
+ ggml_mutex_lock_shared(&threadpool->mutex);
+ if (threadpool->pause) {
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+ }
+ GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
+ ggml_mutex_unlock_shared(&threadpool->mutex);
+ }
+
+ // This must be checked after the cond_wait
+ if (threadpool->stop) break;
+
+ // Check if there is new work
+ // The main thread is the only one that can dispatch new work
+
+ ggml_graph_compute_check_for_work(state);
+ if (state->pending) {
+ state->pending = false;
+
+ ggml_graph_compute_thread(state);
+ }
+ }
+
+ return (thread_ret_t) 0;
+}
+
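+// The pool dispatches work by bumping the n_graph counter: each worker remembers the
+// last value it observed (state->last_graph) and treats any difference as a newly
+// dispatched graph. Only the main thread increments the counter, and it does so under
+// the mutex, so sleeping workers cannot miss a generation.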
+// Start processing new graph
+static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
+{
+ // always take the mutex here because the worker threads are doing hybrid poll/wait
+
+ ggml_mutex_lock(&threadpool->mutex);
+
+ atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+
+ if (threadpool->pause) {
+ // Update main thread prio and affinity to match the threadpool settings
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+ ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+ }
+
+ // resume does cond broadcast
+ ggml_threadpool_resume_locked(threadpool);
+ } else {
+ ggml_cond_broadcast(&threadpool->cond);
+ }
+
+ ggml_mutex_unlock(&threadpool->mutex);
+}
+
+#endif // GGML_USE_OPENMP
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+ p->n_threads = n_threads;
+ p->prio = 0; // default priority (usually means normal or inherited)
+ p->poll = 50; // hybrid-polling enabled
+ p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+ p->paused = false; // threads are ready to go
+ memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+ struct ggml_threadpool_params p;
+ ggml_threadpool_params_init(&p, n_threads);
+ return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+ if (p0->n_threads != p1->n_threads ) return false;
+ if (p0->prio != p1->prio ) return false;
+ if (p0->poll != p1->poll ) return false;
+ if (p0->strict_cpu != p1->strict_cpu ) return false;
+ return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
+
+static struct ggml_threadpool * ggml_threadpool_new_impl(
+ struct ggml_threadpool_params * tpp,
+ struct ggml_cgraph * cgraph,
+ struct ggml_cplan * cplan) {
+
+ struct ggml_threadpool * threadpool =
+ GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+ {
+ threadpool->cgraph = cgraph;
+ threadpool->cplan = cplan;
+ threadpool->n_graph = 0;
+ threadpool->n_barrier = 0;
+ threadpool->n_barrier_passed = 0;
+ threadpool->current_chunk = 0;
+ threadpool->stop = false;
+ threadpool->pause = tpp->paused;
+ threadpool->workers = NULL;
+ threadpool->n_threads_max = tpp->n_threads;
+ threadpool->n_threads_cur = tpp->n_threads;
+ threadpool->poll = tpp->poll;
+ threadpool->prio = tpp->prio;
+ threadpool->ec = GGML_STATUS_SUCCESS;
+ }
+
+ // Allocate and init workers state
+ const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
+ struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+
+ memset(workers, 0, workers_size);
+ for (int j = 0; j < tpp->n_threads; j++) {
+ workers[j].threadpool = threadpool;
+ workers[j].ith = j;
+ }
+
+ threadpool->workers = workers;
+
+#ifndef GGML_USE_OPENMP
+ ggml_mutex_init(&threadpool->mutex);
+ ggml_cond_init(&threadpool->cond);
+
+ // Spin up the worker threads and assign their CPU placements.
+ // The main thread is placed last (towards the higher numbered CPU cores).
+
+ int32_t cpumask_iter = 0;
+
+ for (int j = 1; j < tpp->n_threads; j++) {
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+ int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
+ GGML_ASSERT(rc == 0);
+ }
+
+ ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+ if (!threadpool->pause) {
+ // Update main thread prio and affinity at the start, otherwise we'll do it in resume
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+ ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+ }
+ }
+#endif // GGML_USE_OPENMP
+
+ return threadpool;
+}
+
+struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
+ return ggml_threadpool_new_impl(tpp, NULL, NULL);
+}
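+
+// End-to-end sketch (comment only; assumes a graph `gf` built elsewhere): create a
+// persistent threadpool, plan the graph against it, compute, then release the pool:
+//
+//   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
+//   tpp.prio = GGML_SCHED_PRIO_HIGH;                 // optional: raise worker priority
+//   struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
+//
+//   struct ggml_cplan cplan = ggml_graph_plan(gf, tpp.n_threads, tp);
+//   uint8_t * work = cplan.work_size > 0 ? malloc(cplan.work_size) : NULL;
+//   cplan.work_data = work;
+//
+//   ggml_graph_compute(gf, &cplan);
+//
+//   free(work);
+//   ggml_threadpool_free(tp);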
+
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
- int n_threads = cplan->n_threads;
-
- struct ggml_compute_state_shared state_shared = {
- /*.cgraph =*/ cgraph,
- /*.cgraph_plan =*/ cplan,
- /*.n_threads =*/ n_threads,
- /*.n_barrier =*/ 0,
- /*.n_barrier_passed =*/ 0,
- /*.abort_callback =*/ NULL,
- /*.abort_callback_data =*/ NULL,
- /*.current_chunk =*/ 0,
- /*.ec =*/ GGML_STATUS_SUCCESS,
- };
+ int n_threads = cplan->n_threads;
+ struct ggml_threadpool * threadpool = cplan->threadpool;
+
+ bool disposable_threadpool = false;
+
+ if (threadpool == NULL) {
+ GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+ disposable_threadpool = true;
+
+ struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
+ threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
+ } else {
+ // Reset some of the parameters that need resetting
+ // No worker threads should be accessing the parameters below at this stage
+ threadpool->cgraph = cgraph;
+ threadpool->cplan = cplan;
+ threadpool->n_threads_cur = n_threads;
+ threadpool->current_chunk = 0;
+ threadpool->ec = GGML_STATUS_SUCCESS;
+ }
+
+ if (n_threads > threadpool->n_threads_max) {
+ GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
+ }
#ifdef GGML_USE_OPENMP
if (n_threads > 1) {
{
// update the number of threads from the actual number of threads that we got from OpenMP
n_threads = omp_get_num_threads();
- state_shared.n_threads = n_threads;
+ threadpool->n_threads_cur = n_threads;
}
- struct ggml_compute_state worker = {
- .thrd = 0,
- .ith = omp_get_thread_num(),
- .shared = &state_shared,
- };
- ggml_graph_compute_thread(&worker);
+ ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
}
} else {
- struct ggml_compute_state worker = {
- .thrd = 0,
- .ith = 0,
- .shared = &state_shared,
- };
- ggml_graph_compute_thread(&worker);
+ ggml_graph_compute_thread(&threadpool->workers[0]);
}
#else
- struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
- for (int j = 0; j < n_threads; ++j) {
- workers[j] = (struct ggml_compute_state) {
- .thrd = 0,
- .ith = j,
- .shared = &state_shared,
- };
- }
-
- // create thread pool
- for (int j = 1; j < n_threads; ++j) {
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
-
- // this is a work thread too
- ggml_graph_compute_thread(&workers[0]);
+ // Kick all threads to start the new graph
+ ggml_graph_compute_kickoff(threadpool);
- // join or kill thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; j++) {
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
- }
+ // This is a work thread too
+ ggml_graph_compute_thread(&threadpool->workers[0]);
#endif
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
- return state_shared.ec;
+ enum ggml_status ret = threadpool->ec;
+
+ if (disposable_threadpool) {
+ ggml_threadpool_free(threadpool);
+ }
+
+ return ret;
}
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
- struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
opt->iter = iter;
}
- struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
- uint32_t n_threads; // number of threads to use for generation
- uint32_t n_threads_batch; // number of threads to use for batch processing
+ int32_t n_threads; // number of threads to use for generation
+ int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+ // Optional: if a threadpool is not attached explicitly, ggml creates a disposable one automatically
+ LLAMA_API void llama_attach_threadpool(
+ struct llama_context * ctx,
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch);
+ LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+
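+ // Example (sketch, not a prescribed pattern): share a single ggml threadpool between
+ // generation and batch processing by passing NULL for threadpool_batch:
+ //
+ //   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
+ //   ggml_threadpool_t tp = ggml_threadpool_new(&tpp);
+ //   llama_attach_threadpool(ctx, tp, NULL);
+ //   ... run decode / encode as usual ...
+ //   llama_detach_threadpool(ctx);
+ //   ggml_threadpool_free(tp);
+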
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
// Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
- LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
+ LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
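+ // e.g. (sketch) use 4 threads for single-token generation and 8 for prompt/batch processing:
+ //   llama_set_n_threads(ctx, 4, 8);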
// Get the number of threads used for generation of a single token.
- LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+ LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token).
- LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+ LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
- uint32_t n_threads; // number of threads to use for generation
- uint32_t n_threads_batch; // number of threads to use for batch processing
+ int32_t n_threads; // number of threads to use for generation
+ int32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
#endif
ggml_backend_t backend_cpu = nullptr;
+ ggml_threadpool_t threadpool = nullptr;
+ ggml_threadpool_t threadpool_batch = nullptr;
+
bool has_evaluated_once = false;
int64_t t_start_us;
}
static void llama_graph_compute(
- llama_context & lctx,
- ggml_cgraph * gf,
- int n_threads) {
+ llama_context & lctx,
+ ggml_cgraph * gf,
+ int n_threads,
+ ggml_threadpool * threadpool) {
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+ ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
#ifdef GGML_USE_BLAS
}
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
GGML_ASSERT(n_threads > 0);
// non-causal masks do not use the KV cache
llama_set_inputs(lctx, ubatch);
- llama_graph_compute(lctx, gf, n_threads);
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
// update the kv ring buffer
{
lctx.inp_embd_enc = NULL;
lctx.n_outputs = n_tokens;
- const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
GGML_ASSERT(n_threads > 0);
ggml_backend_sched_reset(lctx.sched);
llama_set_inputs(lctx, ubatch);
- llama_graph_compute(lctx, gf, n_threads);
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
// extract embeddings
if (embd) {
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
#endif
//const int64_t t_end = ggml_time_us();
llama_set_k_shift(lctx);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
need_reserve = true;
}
}
}
+void llama_attach_threadpool(
+ struct llama_context * ctx,
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch) {
+ ctx->threadpool = threadpool;
+ ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+ ctx->threadpool = nullptr;
+ ctx->threadpool_batch = nullptr;
+}
+
void llama_backend_free(void) {
ggml_quantize_free();
}
}
}
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
ctx->cparams.n_threads = n_threads;
ctx->cparams.n_threads_batch = n_threads_batch;
}
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int32_t llama_n_threads(struct llama_context * ctx) {
return ctx->cparams.n_threads;
}
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
return ctx->cparams.n_threads_batch;
}
}
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);