return cpu_get_num_physical_cores();
}
+// Helper for setting process priority
+
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ return true;
+ }
+
+ DWORD p = NORMAL_PRIORITY_CLASS;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
+ case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+ case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
+ case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
+ }
+
+ if (!SetPriorityClass(GetCurrentProcess(), p)) {
+ fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+ return false;
+ }
+
+ return true;
+}
+
+#else // macOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ return true;
+ }
+
+ int p = 0;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: p = 0; break;
+ case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
+ case GGML_SCHED_PRIO_HIGH: p = -10; break;
+ case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+ }
+
+ if (setpriority(PRIO_PROCESS, 0, p) != 0) {
+ fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+ return false;
+ }
+ return true;
+}
+
+#endif
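+
+// Example (sketch): callers apply this once at startup, e.g.
+//   set_process_priority(params.cpuparams.priority);
+// before creating threadpools; on Windows the per-thread priorities set later by the
+// threadpool are relative to this process priority class.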
+
//
// CLI argument parsing
//
}
}
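+// Resolve derived CPU params: if n_threads is unset (< 0), inherit from the "role model"
+// params when given, otherwise fall back to cpu_get_num_math(); then warn if the affinity
+// mask has fewer set bits than the requested thread count.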
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+ int32_t n_set = 0;
+
+ if (cpuparams.n_threads < 0) {
+ // Assuming everything about cpuparams is invalid
+ if (role_model != nullptr) {
+ cpuparams = *role_model;
+ } else {
+ cpuparams.n_threads = cpu_get_num_math();
+ }
+ }
+
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+ if (cpuparams.cpumask[i]) {
+ n_set++;
+ }
+ }
+
+ if (n_set && n_set < cpuparams.n_threads) {
+ // Not enough set bits, may experience performance issues.
+ fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+ }
+}
+
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
}
}
+ postprocess_cpu_params(params.cpuparams, nullptr);
+ postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
- get_env("LLAMA_ARG_THREADS", params.n_threads);
+ get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
return true;
}
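+// Example (sketch): "0-3" marks CPUs 0..3, "4-" marks CPUs 4..GGML_MAX_N_THREADS-1,
+// and "-7" marks CPUs 0..7.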
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+ size_t dash_loc = range.find('-');
+ if (dash_loc == std::string::npos) {
+ fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+ return false;
+ }
+
+ size_t start_i;
+ size_t end_i;
+
+ if (dash_loc == 0) {
+ start_i = 0;
+ } else {
+ start_i = std::stoull(range.substr(0, dash_loc));
+ if (start_i >= GGML_MAX_N_THREADS) {
+ fprintf(stderr, "Start index out of bounds!\n");
+ return false;
+ }
+ }
+
+ if (dash_loc == range.length() - 1) {
+ end_i = GGML_MAX_N_THREADS - 1;
+ } else {
+ end_i = std::stoull(range.substr(dash_loc + 1));
+ if (end_i >= GGML_MAX_N_THREADS) {
+ fprintf(stderr, "End index out of bounds!\n");
+ return false;
+ }
+ }
+
+ for (size_t i = start_i; i <= end_i; i++) {
+ boolmask[i] = true;
+ }
+
+ return true;
+}
+
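+// Example (sketch): "0x5" (or just "5") sets CPUs 0 and 2; "0xff" sets CPUs 0..7.
+// The least-significant hex digit maps to CPUs 0..3, the next one to 4..7, and so on.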
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+ // Discard potential 0x prefix
+ size_t start_i = 0;
+ if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
+ start_i = 2;
+ }
+
+ size_t num_digits = mask.length() - start_i;
+ if (num_digits > 128) num_digits = 128;
+
+ size_t end_i = num_digits + start_i;
+
+ for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+ char c = mask.at(i);
+ int8_t id = c;
+
+ if ((c >= '0' && c <= '9')) {
+ id -= '0';
+ } else if (c >= 'a' && c <= 'f') {
+ id -= 'a' - 10;
+ } else if (c >= 'A' && c <= 'F') {
+ id -= 'A' - 10;
+ } else {
+ fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+ return false;
+ }
+
+ boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
+ boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+ boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+ boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
+ }
+
+ return true;
+}
+
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
}
if (arg == "-t" || arg == "--threads") {
CHECK_ARG
- params.n_threads = std::stoi(argv[i]);
- if (params.n_threads <= 0) {
- params.n_threads = std::thread::hardware_concurrency();
+ params.cpuparams.n_threads = std::stoi(argv[i]);
+ if (params.cpuparams.n_threads <= 0) {
+ params.cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
+ if (arg == "-C" || arg == "--cpu-mask") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "-Cr" || arg == "--cpu-range") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "--prio") {
+ CHECK_ARG
+ params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict") {
+ CHECK_ARG
+ params.cpuparams.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll") {
+ CHECK_ARG
+ params.cpuparams.poll = std::stoul(argv[i]);
+ return true;
+ }
if (arg == "-tb" || arg == "--threads-batch") {
CHECK_ARG
- params.n_threads_batch = std::stoi(argv[i]);
- if (params.n_threads_batch <= 0) {
- params.n_threads_batch = std::thread::hardware_concurrency();
+ params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+ if (params.cpuparams_batch.n_threads <= 0) {
+ params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
+ if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "-Crb" || arg == "--cpu-range-batch") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "--prio-batch") {
+ CHECK_ARG
+ params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict-batch") {
+ CHECK_ARG
+ params.cpuparams_batch.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll-batch") {
+ CHECK_ARG
+ params.cpuparams_batch.poll = std::stoul(argv[i]);
+ return true;
+ }
if (arg == "-td" || arg == "--threads-draft") {
CHECK_ARG
- params.n_threads_draft = std::stoi(argv[i]);
- if (params.n_threads_draft <= 0) {
- params.n_threads_draft = std::thread::hardware_concurrency();
+ params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+ if (params.draft_cpuparams.n_threads <= 0) {
+ params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
+ }
+ if (arg == "-Cd" || arg == "--cpu-mask-draft") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.draft_cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "-Crd" || arg == "--cpu-range-draft") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.draft_cpuparams.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+ return true;
+ }
+ if (arg == "--prio-draft") {
+ CHECK_ARG
+ params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict-draft") {
+ CHECK_ARG
+ params.draft_cpuparams.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll-draft") {
+ CHECK_ARG
+ params.draft_cpuparams.poll = std::stoul(argv[i]);
+ return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
CHECK_ARG
- params.n_threads_batch_draft = std::stoi(argv[i]);
- if (params.n_threads_batch_draft <= 0) {
- params.n_threads_batch_draft = std::thread::hardware_concurrency();
+ params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+ if (params.draft_cpuparams_batch.n_threads <= 0) {
+ params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
+ if (arg == "-Cbd" || arg == "--cpu-mask-batch-draft") {
+ CHECK_ARG
+ std::string mask = argv[i];
+ params.draft_cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+ CHECK_ARG
+ std::string range = argv[i];
+ params.draft_cpuparams_batch.mask_valid = true;
+ invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+ return true;
+ }
+ if (arg == "--prio-batch-draft") {
+ CHECK_ARG
+ params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--cpu-strict-batch-draft") {
+ CHECK_ARG
+ params.draft_cpuparams_batch.strict_cpu = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "--poll-batch-draft") {
+ CHECK_ARG
+ params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
+ return true;
+ }
if (arg == "-p" || arg == "--prompt") {
CHECK_ARG
params.prompt = argv[i];
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
- options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
- options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
- "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+
+#ifndef GGML_USE_OPENMP
+ // these options are available only with the internal threadpool
+ options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+ options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
+ options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+ options.push_back({ "*", " --prio N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+ options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
+
+ options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+ options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+ options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
+ options.push_back({ "*", " --prio-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+ options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll)"});
+
+ options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+ options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+ options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+ options.push_back({ "speculative", " --prio-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+ options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll)"});
+
+ options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask-draft)"});
+ options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+ "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft"});
+ options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
+ "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+ options.push_back({ "speculative", " --prio-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio-draft)"});
+ options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: same as --poll-draft)"});
+#endif // GGML_USE_OPENMP
+
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
- options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
- os << "system_info: n_threads = " << params.n_threads;
- if (params.n_threads_batch != -1) {
- os << " (n_threads_batch = " << params.n_threads_batch << ")";
+ os << "system_info: n_threads = " << params.cpuparams.n_threads;
+ if (params.cpuparams_batch.n_threads != -1) {
+ os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
- cparams.n_threads = params.n_threads;
- cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ cparams.n_threads = params.cpuparams.n_threads;
+ cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
return cparams;
}
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+ struct ggml_threadpool_params tpp;
+
+ ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+ if (params.mask_valid) {
+ std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
+ }
+
+ tpp.prio = params.priority;
+ tpp.poll = params.poll;
+ tpp.strict_cpu = params.strict_cpu;
+
+ return tpp;
+}
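+
+// Example (sketch): the examples combine this with ggml_threadpool_new() and
+// llama_attach_threadpool(), e.g.
+//   struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);
+//   struct ggml_threadpool * tp       = ggml_threadpool_new(&tpp);
+//   llama_attach_threadpool(ctx, tp, /* threadpool_batch */ NULL);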
+
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
- fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+ fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
DIMRE_METHOD_MEAN,
};
+struct cpu_params {
+ int n_threads = -1;
+ bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+ bool mask_valid = false; // Default: any CPU
+ enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+ bool strict_cpu = false; // Use strict CPU placement
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
- int32_t n_threads = cpu_get_num_math();
- int32_t n_threads_draft = -1;
- int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
- int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ struct cpu_params cpuparams;
+ struct cpu_params cpuparams_batch;
+ struct cpu_params draft_cpuparams;
+ struct cpu_params draft_cpuparams_batch;
+
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
- int32_t n_threads_http = -1; // number of threads to process HTTP requests
+ int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string gpt_params_get_system_info(const gpt_params & params);
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
//
// String utils
//
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
- int32_t n_threads = 1;
+ int n_threads = 1;
int32_t n_iterations = 10;
};
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
- pca_params.n_threads = params.n_threads;
- pca_params.n_batch = params.n_pca_batch;
+ pca_params.n_threads = params.cpuparams.n_threads;
+ pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
g_verbose = (params.verbosity == 1);
try {
- lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+ lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
#include <sstream>
#include <string>
#include <vector>
+#include <thread>
#include "ggml.h"
#include "llama.h"
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
+ std::vector<std::string> cpu_mask;
+ std::vector<bool> cpu_strict;
+ std::vector<int> poll;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
+ ggml_sched_priority prio;
+ int delay;
bool verbose;
output_formats output_format;
output_formats output_format_stderr;
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
+ /* cpu_mask */ {"0x0"},
+ /* cpu_strict */ {false},
+ /* poll */ {50},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
+ /* prio */ GGML_SCHED_PRIO_NORMAL,
+ /* delay */ 0,
/* verbose */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+ printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+ printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+ printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+ printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
+ printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa;
+ params.prio = cmd_params_defaults.prio;
+ params.delay = cmd_params_defaults.delay;
for (int i = 1; i < argc; i++) {
arg = argv[i];
}
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+ } else if (arg == "-C" || arg == "--cpu-mask") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = string_split<std::string>(argv[i], split_delim);
+ params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+ } else if (arg == "--cpu-strict") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = string_split<bool>(argv[i], split_delim);
+ params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+ } else if (arg == "--poll") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.reps = std::stoi(argv[i]);
+ } else if (arg == "--prio") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+ } else if (arg == "--delay") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
+ if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
+ if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+ if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
return params;
}
ggml_type type_k;
ggml_type type_v;
int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
- for (const auto & nt : params.n_threads) {
+ for (const auto & nt : params.n_threads)
+ for (const auto & cm : params.cpu_mask)
+ for (const auto & cs : params.cpu_strict)
+ for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
+ /* .cpu_mask = */ cm,
+ /* .cpu_strict = */ cs,
+ /* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
+ /* .cpu_mask = */ cm,
+ /* .cpu_strict = */ cs,
+ /* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
+ /* .cpu_mask = */ cm,
+ /* .cpu_strict = */ cs,
+ /* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
int n_batch;
int n_ubatch;
int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
+ cpu_mask = inst.cpu_mask;
+ cpu_strict = inst.cpu_strict;
+ poll = inst.poll;
has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k;
type_v = inst.type_v;
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
- "n_threads", "type_k", "type_v",
+ "n_threads", "cpu_mask", "cpu_strict", "poll",
+ "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
- "avg_ts", "stddev_ts"
+ "avg_ts", "stddev_ts",
};
return fields;
}
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
- field == "n_threads" ||
+ field == "n_threads" || field == "poll" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+ field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
}
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
- std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+ std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+ ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
return -30;
}
if (field == "t/s") {
- return 16;
+ return 20;
}
if (field == "size" || field == "params") {
return 10;
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.emplace_back("n_threads");
}
+ if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+ fields.emplace_back("cpu_mask");
+ }
+ if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+ fields.emplace_back("cpu_strict");
+ }
+ if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+ fields.emplace_back("poll");
+ }
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch");
}
llama_backend_init();
llama_numa_init(params.numa);
+ set_process_priority(params.prio);
+
// initialize printer
std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
llama_kv_cache_clear(ctx);
+ // cool off before the test
+ if (params.delay) {
+ std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+ }
+
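+ // Each test configuration gets its own threadpool, built from the per-test
+ // cpu-mask / cpu-strict / poll values and the global --prio setting.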
+ struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+ if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+ LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+ exit(1);
+ }
+ tpp.strict_cpu = t.cpu_strict;
+ tpp.poll = t.poll;
+ tpp.prio = params.prio;
+
+ struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ exit(1);
+ }
+
+ llama_attach_threadpool(ctx, threadpool, NULL);
+
// warmup run
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
llama_print_timings(ctx);
llama_free(ctx);
+
+ ggml_threadpool_free(threadpool);
}
llama_free_model(lmodel);
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
- ctx_params.n_threads = UInt32(n_threads)
- ctx_params.n_threads_batch = UInt32(n_threads)
+ ctx_params.n_threads = Int32(n_threads)
+ ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
- embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+ embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
- embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+ embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
- auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+ auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
return 1;
}
+ LOG("%s: llama threadpool init, n_threads = %d\n",
+ __func__,
+ (int) params.cpuparams.n_threads
+ );
+ struct ggml_threadpool_params tpp_batch =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+ struct ggml_threadpool_params tpp =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+ set_process_priority(params.cpuparams.priority);
+
+ struct ggml_threadpool * threadpool_batch = NULL;
+ if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+ threadpool_batch = ggml_threadpool_new(&tpp_batch);
+ if (!threadpool_batch) {
+ LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+ exit(1);
+ }
+
+ // Start the non-batch threadpool in the paused state
+ tpp.paused = true;
+ }
+
+ struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ exit(1);
+ }
+
+ llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+ if (ctx_guidance) {
+ llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
+ }
+
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
llama_sampling_free(ctx_sampling);
llama_backend_free();
+ ggml_threadpool_free(threadpool);
+ ggml_threadpool_free(threadpool_batch);
+
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
});
LOG_INFO("system info", {
- {"n_threads", params.n_threads},
- {"n_threads_batch", params.n_threads_batch},
+ {"n_threads", params.cpuparams.n_threads},
+ {"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
- if (params.n_threads_draft > 0) {
- params.n_threads = params.n_threads_draft;
+ if (params.draft_cpuparams.n_threads > 0) {
+ params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
}
- params.n_threads_batch = params.n_threads_batch_draft;
+
+ params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context;
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator
struct ggml_tallocr {
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+ GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
#define GGML_MAX_SRC 10
+#define GGML_MAX_N_THREADS 512
+
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
+ // Scheduling priorities
+ enum ggml_sched_priority {
+ GGML_SCHED_PRIO_NORMAL,
+ GGML_SCHED_PRIO_MEDIUM,
+ GGML_SCHED_PRIO_HIGH,
+ GGML_SCHED_PRIO_REALTIME
+ };
+
+ // Threadpool params
+ // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+ struct ggml_threadpool_params {
+ bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+ int n_threads; // number of threads
+ enum ggml_sched_priority prio; // thread priority
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
+ bool strict_cpu; // strict cpu placement
+ bool paused; // start in paused state
+ };
+
+ struct ggml_threadpool; // forward declaration, see ggml.c
+
+ typedef struct ggml_threadpool * ggml_threadpool_t;
+
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
+ struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
+ GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
+ GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+ GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+ GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+ GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
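+
+ // Minimal usage sketch (assuming a built cgraph and a caller-managed work buffer):
+ //   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4);
+ //   struct ggml_threadpool *       tp = ggml_threadpool_new(&tpp);
+ //   struct ggml_cplan            plan = ggml_graph_plan(cgraph, 4, tp);
+ //   // ... allocate plan.work_data of size plan.work_size, then ...
+ //   ggml_graph_compute(cgraph, &plan);
+ //   ggml_threadpool_free(tp);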
+
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
- GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API struct ggml_cplan ggml_graph_plan(
+ const struct ggml_cgraph * cgraph,
+ int n_threads, /* = GGML_DEFAULT_N_THREADS */
+ struct ggml_threadpool * threadpool /* = NULL */ );
+ GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
add_compile_definitions(_GNU_SOURCE)
endif()
#endif
struct ggml_backend_cpu_context {
- int n_threads;
- void * work_data;
- size_t work_size;
+ int n_threads;
+ ggml_threadpool_t threadpool;
+
+ void * work_data;
+ size_t work_size;
ggml_abort_callback abort_callback;
void * abort_callback_data;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data);
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
+ ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->n_threads = n_threads;
}
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
+ // already had a different threadpool, pause/suspend it before switching
+ ggml_threadpool_pause(ctx->threadpool);
+ }
+ ctx->threadpool = threadpool;
+}
+
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
#endif
#include <windows.h>
+#if !defined(__clang__)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
typedef atomic_int atomic_flag;
#define ATOMIC_FLAG_INIT 0
+typedef enum {
+ memory_order_relaxed,
+ memory_order_consume,
+ memory_order_acquire,
+ memory_order_release,
+ memory_order_acq_rel,
+ memory_order_seq_cst
+} memory_order;
+
static void atomic_store(atomic_int * ptr, LONG val) {
InterlockedExchange(ptr, val);
}
+static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
+ // TODO: add support for explicit memory order
+ InterlockedExchange(ptr, val);
+}
static LONG atomic_load(atomic_int * ptr) {
return InterlockedCompareExchange(ptr, 0, 0);
}
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+ // TODO: add support for explicit memory order
+ return InterlockedCompareExchange(ptr, 0, 0);
+}
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
return InterlockedExchangeAdd(ptr, inc);
}
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
- return atomic_fetch_add(ptr, -(dec));
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+ // TODO: add support for explicit memory order
+ return InterlockedExchangeAdd(ptr, inc);
}
static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
return InterlockedExchange(ptr, 1);
static void atomic_flag_clear(atomic_flag * ptr) {
InterlockedExchange(ptr, 0);
}
+#else // clang
+#include <stdatomic.h>
+#endif
typedef HANDLE pthread_t;
return 0;
}
#else
+
#include <pthread.h>
#include <stdatomic.h>
+#include <sched.h>
typedef void * thread_ret_t;
struct ggml_context context;
};
-struct ggml_compute_state_shared {
- const struct ggml_cgraph * cgraph;
- const struct ggml_cplan * cplan;
+//
+// Threading defs
+//
+
+typedef pthread_t ggml_thread_t;
+
+#if defined(_WIN32)
+
+typedef CONDITION_VARIABLE ggml_cond_t;
+typedef SRWLOCK ggml_mutex_t;
+
+#define ggml_mutex_init(m) InitializeSRWLock(m)
+#define ggml_mutex_destroy(m)
+#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m)
+#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
+#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m)
+#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
+
+#define ggml_cond_init(c) InitializeConditionVariable(c)
+#define ggml_cond_destroy(c)
+#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
+#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
+#else
- int n_threads;
+typedef pthread_cond_t ggml_cond_t;
+typedef pthread_mutex_t ggml_mutex_t;
+
+#define ggml_mutex_init(m) pthread_mutex_init(m, NULL)
+#define ggml_mutex_destroy(m) pthread_mutex_destroy(m)
+#define ggml_mutex_lock(m) pthread_mutex_lock(m)
+#define ggml_mutex_unlock(m) pthread_mutex_unlock(m)
+#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m)
+#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
+
+#define ggml_lock_init(x) UNUSED(x)
+#define ggml_lock_destroy(x) UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x) _mm_pause()
+#else
+#define ggml_lock_lock(x) UNUSED(x)
+#endif
+#define ggml_lock_unlock(x) UNUSED(x)
+
+#define GGML_LOCK_INITIALIZER 0
+#define ggml_cond_init(c) pthread_cond_init(c, NULL)
+#define ggml_cond_destroy(c) pthread_cond_destroy(c)
+#define ggml_cond_wait(c, m) pthread_cond_wait(c, m)
+#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join pthread_join
+
+#endif
+
+// Threadpool def
+struct ggml_threadpool {
+ ggml_mutex_t mutex; // mutex for cond.var
+ ggml_cond_t cond; // cond.var for waiting for new work
+
+ struct ggml_cgraph * cgraph;
+ struct ggml_cplan * cplan;
// synchronization primitives
+ atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
atomic_int n_barrier;
atomic_int n_barrier_passed;
+ atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
- ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
- void * abort_callback_data;
+ // these are atomic as an annotation for thread-sanitizer
+ atomic_bool stop; // Used for stopping the threadpool altogether
+ atomic_bool pause; // Used for pausing the threadpool or individual threads
- atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
+ struct ggml_compute_state * workers; // per thread state
+ int n_threads_max; // number of threads in the pool
+ int n_threads_cur; // number of threads used in the current graph
+
+ int32_t prio; // Scheduling priority
+ uint32_t poll; // Polling level (0 - no polling)
enum ggml_status ec;
};
+// Per-thread state
struct ggml_compute_state {
+#ifndef GGML_USE_OPENMP
ggml_thread_t thrd;
+ bool cpumask[GGML_MAX_N_THREADS];
+ int last_graph;
+ bool pending;
+#endif
+ struct ggml_threadpool * threadpool;
int ith;
- struct ggml_compute_state_shared * shared;
};
struct ggml_compute_params {
size_t wsize;
void * wdata;
- struct ggml_compute_state_shared * shared;
+ struct ggml_threadpool * threadpool;
};
//
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void ggml_thread_cpu_relax(void) {
+ __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void ggml_thread_cpu_relax(void) {
+ _mm_pause();
+}
+#else
+static inline void ggml_thread_cpu_relax(void) {;}
+#endif
+
//
// NUMA support
//
}
#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
- if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_threadpool * threadpool) {
+ if (threadpool->n_threads_cur == 1) {
return;
}
#pragma omp barrier
}
#else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
- if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_threadpool * threadpool) {
+ if (threadpool->n_threads_cur == 1) {
return;
}
- atomic_int * n_barrier = &shared->n_barrier;
- atomic_int * n_barrier_passed = &shared->n_barrier_passed;
+ atomic_int * n_barrier = &threadpool->n_barrier;
+ atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
- int n_threads = shared->n_threads;
- int passed_old = atomic_load(n_barrier_passed);
+ int n_threads = threadpool->n_threads_cur;
+ int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
// last thread
atomic_store(n_barrier, 0);
- atomic_fetch_add(n_barrier_passed, 1);
+ atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
} else {
// wait for other threads
- const int n_spin_before_sleep = 100000;
while (true) {
- for (int i = 0; i < n_spin_before_sleep; i++) {
- if (atomic_load(n_barrier_passed) != passed_old) {
- return;
- }
- #if defined(__SSE3__)
- _mm_pause();
- #endif
+ if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
+ return;
}
- sched_yield();
+ ggml_thread_cpu_relax();
}
}
}
((char *) src0->data),
ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
const int ith = params->ith;
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
- atomic_store(¶ms->shared->current_chunk, nth);
+ atomic_store_explicit(¶ms->threadpool->current_chunk, nth, memory_order_relaxed);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
#if GGML_USE_LLAMAFILE
if (src1->type != vec_dot_type) {
break;
}
- current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1);
+ current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed);
}
}
}
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
if (ith == 0) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// dst[:,:,:,:] = 0
// for i2,i3:
if (ith == 0) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// parallelize by last three dimensions
((char *) src0->data),
ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
const int ith = params->ith;
((char *) src0->data),
ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
// TODO: handle transposed/permuted matrices
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
memset(dst->data, 0, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int32_t stride = ggml_get_op_params_i32(dst, 0);
if (ith == 0) {
memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
const int64_t elem_q = ggml_nelements(q);
const int64_t elem_k = ggml_nelements(k);
if (params->ith == 0) {
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
}
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
if (ith == 0) {
memset(sums, 0, sizeof(float) * (nth + nth * nc));
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
// rows per thread
const int dr = (nr + nth - 1)/nth;
}
#endif
}
- ggml_barrier(params->shared);
+ ggml_barrier(params->threadpool);
if (ith == 0) {
float * dp = (float *) dst->data;
ggml_hash_set_reset(&cgraph->visited_hash_set);
}
-//
-// thread data
-//
-// synchronization is done via busy loops
-// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
-//
-
-#ifdef __APPLE__
-
-//#include <os/lock.h>
-//
-//typedef os_unfair_lock ggml_lock_t;
-//
-//#define ggml_lock_init(x) UNUSED(x)
-//#define ggml_lock_destroy(x) UNUSED(x)
-//#define ggml_lock_lock os_unfair_lock_lock
-//#define ggml_lock_unlock os_unfair_lock_unlock
-//
-//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x) UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x) UNUSED(x)
-#define ggml_lock_unlock(x) UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join pthread_join
-
-#else
-
-//typedef pthread_spinlock_t ggml_lock_t;
-
-//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
-//#define ggml_lock_destroy pthread_spin_destroy
-//#define ggml_lock_lock pthread_spin_lock
-//#define ggml_lock_unlock pthread_spin_unlock
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x) UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x) _mm_pause()
-#else
-#define ggml_lock_lock(x) UNUSED(x)
-#endif
-#define ggml_lock_unlock(x) UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join pthread_join
-
-#endif
-
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
return n_tasks;
}
-struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
+
+#if defined(_WIN32)
+#include "windows.h"
+
+// TODO: support > 64 CPUs
+static bool ggml_thread_apply_affinity(const bool * mask) {
+ HANDLE h = GetCurrentThread();
+ uint64_t bitmask = 0ULL;
+
+ assert(GGML_MAX_N_THREADS >= 64);
+
+ for (int32_t i = 0; i < 8; i++) {
+ int32_t idx = i * 8;
+ uint8_t val = 0;
+ val |= mask[idx + 0] << 0;
+ val |= mask[idx + 1] << 1;
+ val |= mask[idx + 2] << 2;
+ val |= mask[idx + 3] << 3;
+ val |= mask[idx + 4] << 4;
+ val |= mask[idx + 5] << 5;
+ val |= mask[idx + 6] << 6;
+ val |= mask[idx + 7] << 7;
+ bitmask |= (uint64_t)val << idx;
+ }
+
+ for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
+ if (mask[i]) {
+ fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
+ break;
+ }
+ }
+
+ DWORD_PTR m = (DWORD_PTR)bitmask;
+
+ m = SetThreadAffinityMask(h, m);
+
+ return m != 0;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+ // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
+ // This is up to the applications.
+ DWORD p = THREAD_PRIORITY_NORMAL;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
+ case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
+ case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
+ case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+ }
+
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ // Keep inherited policy/priority
+ return true;
+ }
+
+ if (!SetThreadPriority(GetCurrentThread(), p)) {
+ fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
+ return false;
+ }
+
+ return true;
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+ // Not supported on Apple platforms
+ UNUSED(mask);
+ return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+ struct sched_param p;
+ int32_t policy = SCHED_OTHER;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
+ case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
+ case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
+ case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
+ }
+
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ // Keep inherited policy/priority
+ return true;
+ }
+
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+ if (err != 0) {
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+ return false;
+ }
+
+ return true;
+}
+
+#else // posix?
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+ cpu_set_t cpuset;
+ int err;
+
+ CPU_ZERO(&cpuset);
+
+ for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+ if (mask[i]) {
+ GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+ CPU_SET(i, &cpuset);
+ }
+ }
+
+#ifdef __ANDROID__
+ err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+ if (err < 0) {
+ err = errno;
+ }
+#else
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+ if (err != 0) {
+ fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
+ return false;
+ }
+
+ return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+ struct sched_param p;
+ int32_t policy = SCHED_OTHER;
+ switch (prio) {
+ case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
+ case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
+ case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
+ case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
+ }
+
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
+ // Keep inherited policy/priority
+ return true;
+ }
+
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+ if (err != 0) {
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+ return false;
+ }
+
+ return true;
+}
+
+#endif
+
+static bool ggml_thread_cpumask_is_valid(const bool * mask) {
+ for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
+ if (mask[i]) { return true; }
+ }
+ return false;
+}
+
+static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
+ if (!strict) {
+ memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
+ return;
+ } else {
+ memset(local_mask, 0, GGML_MAX_N_THREADS);
+ int32_t base_idx = *iter;
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+ int32_t idx = base_idx + i;
+ if (idx >= GGML_MAX_N_THREADS) {
+ // Just a cheaper modulo
+ idx -= GGML_MAX_N_THREADS;
+ }
+ if (global_mask[idx]) {
+ local_mask[idx] = 1;
+ *iter = idx + 1;
+ return;
+ }
+ }
+ }
+}
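+
+// Illustrative example (comment only): with a global mask that enables CPUs 1, 2 and 4,
+// strict placement hands out one CPU per worker in round-robin order, while non-strict
+// placement gives every worker the full mask:
+//
+//   bool global[GGML_MAX_N_THREADS] = { 0, 1, 1, 0, 1 }; // remaining entries are zero
+//   bool local [GGML_MAX_N_THREADS];
+//   int32_t iter = 0;
+//   ggml_thread_cpumask_next(global, local, true, &iter); // selects CPU 1, iter == 2
+//   ggml_thread_cpumask_next(global, local, true, &iter); // selects CPU 2, iter == 3
+//   ggml_thread_cpumask_next(global, local, true, &iter); // selects CPU 4, iter == 5
+//   ggml_thread_cpumask_next(global, local, true, &iter); // wraps around, selects CPU 1 again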
+
+void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
+ if (!threadpool) return;
+
+#ifndef GGML_USE_OPENMP
+ struct ggml_compute_state* workers = threadpool->workers;
+ const int n_threads = threadpool->n_threads_max;
+
+ ggml_mutex_lock(&threadpool->mutex);
+
+ threadpool->stop = true;
+ threadpool->pause = false;
+
+ ggml_cond_broadcast(&threadpool->cond);
+ ggml_mutex_unlock(&threadpool->mutex);
+
+ for (int j = 1; j < n_threads; j++) {
+ int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
+ GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
+ UNUSED(rc);
+ }
+
+ ggml_mutex_destroy(&threadpool->mutex);
+ ggml_cond_destroy(&threadpool->cond);
+#endif // GGML_USE_OPENMP
+
+ GGML_ALIGNED_FREE(threadpool->workers);
+ GGML_ALIGNED_FREE(threadpool);
+}
+
+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
+ GGML_PRINT_DEBUG("Pausing threadpool\n");
+ threadpool->pause = true;
+ ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
+ GGML_PRINT_DEBUG("Resuming threadpool\n");
+ threadpool->pause = false;
+ ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
+void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+ ggml_mutex_lock(&threadpool->mutex);
+ if (!threadpool->pause) {
+ ggml_threadpool_pause_locked(threadpool);
+ }
+ ggml_mutex_unlock(&threadpool->mutex);
+#else
+ UNUSED(threadpool);
+#endif
+}
+
+void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+ ggml_mutex_lock(&threadpool->mutex);
+ if (threadpool->pause) {
+ ggml_threadpool_resume_locked(threadpool);
+ }
+ ggml_mutex_unlock(&threadpool->mutex);
+#else
+ UNUSED(threadpool);
+#endif
+}
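+
+// Usage sketch (comment only): a persistent threadpool can be created in the paused
+// state so the workers sleep until the first graph is dispatched, and paused again
+// once the pool goes idle:
+//
+//   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
+//   tpp.paused = true;
+//   struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
+//   ... dispatching a graph resumes a paused pool automatically ...
+//   ggml_threadpool_pause(tp);   // park the workers while the pool is idle
+//   ggml_threadpool_free(tp);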
+
+struct ggml_cplan ggml_graph_plan(
+ const struct ggml_cgraph * cgraph,
+ int n_threads,
+ struct ggml_threadpool * threadpool) {
+
+ if (threadpool == NULL) {
+ GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+ }
if (n_threads <= 0) {
- n_threads = GGML_DEFAULT_N_THREADS;
+ n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
}
size_t work_size = 0;
}
if (work_size > 0) {
- work_size += CACHE_LINE_SIZE*(n_threads - 1);
+ work_size += CACHE_LINE_SIZE*(n_threads);
}
- cplan.n_threads = MIN(max_tasks, n_threads);
- cplan.work_size = work_size;
- cplan.work_data = NULL;
+ cplan.threadpool = threadpool;
+ cplan.n_threads = MIN(max_tasks, n_threads);
+ cplan.work_size = work_size;
+ cplan.work_data = NULL;
return cplan;
}
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
- const struct ggml_cgraph * cgraph = state->shared->cgraph;
- const struct ggml_cplan * cplan = state->shared->cplan;
+ const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
+ const struct ggml_cplan * cplan = state->threadpool->cplan;
set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
- /*.ith =*/ state->ith,
- /*.nth =*/ state->shared->n_threads,
- /*.wsize =*/ cplan->work_size,
- /*.wdata =*/ cplan->work_data,
- /*.shared=*/ state->shared,
+ /*.ith =*/ state->ith,
+ /*.nth =*/ state->threadpool->n_threads_cur,
+ /*.wsize =*/ cplan->work_size,
+ /*.wdata =*/ cplan->work_data,
+ /*.threadpool=*/ state->threadpool,
};
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
ggml_compute_forward(¶ms, node);
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
- state->shared->ec = GGML_STATUS_ABORTED;
+ state->threadpool->ec = GGML_STATUS_ABORTED;
}
- ggml_barrier(state->shared);
+ ggml_barrier(state->threadpool);
- if (state->shared->ec != GGML_STATUS_SUCCESS) {
+ if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
break;
}
}
return 0;
}
+#ifndef GGML_USE_OPENMP
+
+static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ if (state->pending || threadpool->stop || threadpool->pause) { return true; }
+
+ // check for new graph/work
+ int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+ if (new_graph != state->last_graph) {
+ state->pending = (state->ith < threadpool->n_threads_cur);
+ state->last_graph = new_graph;
+ }
+
+ return state->pending;
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ // This scaling makes 0 ... 100 a reasonable range for the polling level across modern processors.
+ // It could potentially be adjusted dynamically based on the observed load.
+ const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+ for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
+ // No new work. Keep polling.
+ ggml_thread_cpu_relax();
+ }
+
+ return state->pending;
+}
+
+static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ if (ggml_graph_compute_poll_for_work(state)) {
+ return state->pending;
+ }
+
+ ggml_mutex_lock_shared(&threadpool->mutex);
+ while (!ggml_graph_compute_ready(state)) {
+ // No new work. Wait for the signal.
+ GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+ }
+ ggml_mutex_unlock_shared(&threadpool->mutex);
+
+ return state->pending;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
+ struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+ struct ggml_threadpool * threadpool = state->threadpool;
+
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(state->cpumask)) {
+ ggml_thread_apply_affinity(state->cpumask);
+ }
+
+ while (true) {
+ // Check if we need to sleep
+ while (threadpool->pause) {
+ GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
+ ggml_mutex_lock_shared(&threadpool->mutex);
+ if (threadpool->pause) {
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+ }
+ GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
+ ggml_mutex_unlock_shared(&threadpool->mutex);
+ }
+
+ // This must be checked after the cond_wait
+ if (threadpool->stop) break;
+
+ // Check if there is new work
+ // The main thread is the only one that can dispatch new work
+
+ ggml_graph_compute_check_for_work(state);
+ if (state->pending) {
+ state->pending = false;
+
+ ggml_graph_compute_thread(state);
+ }
+ }
+
+ return (thread_ret_t) 0;
+}
+
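+// The pool dispatches work by bumping the n_graph counter: each worker remembers the
+// last value it observed (state->last_graph) and treats any difference as a newly
+// dispatched graph. Only the main thread increments the counter, and it does so under
+// the mutex, so sleeping workers cannot miss a generation.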
+// Start processing new graph
+static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
+{
+ // always take the mutex here because the worker threads are doing hybrid poll/wait
+
+ ggml_mutex_lock(&threadpool->mutex);
+
+ atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+
+ if (threadpool->pause) {
+ // Update main thread prio and affinity to match the threadpool settings
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+ ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+ }
+
+ // resume does cond broadcast
+ ggml_threadpool_resume_locked(threadpool);
+ } else {
+ ggml_cond_broadcast(&threadpool->cond);
+ }
+
+ ggml_mutex_unlock(&threadpool->mutex);
+}
+
+#endif // GGML_USE_OPENMP
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+ p->n_threads = n_threads;
+ p->prio = 0; // default priority (usually means normal or inherited)
+ p->poll = 50; // hybrid-polling enabled
+ p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+ p->paused = false; // threads are ready to go
+ memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+ struct ggml_threadpool_params p;
+ ggml_threadpool_params_init(&p, n_threads);
+ return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+ if (p0->n_threads != p1->n_threads ) return false;
+ if (p0->prio != p1->prio ) return false;
+ if (p0->poll != p1->poll ) return false;
+ if (p0->strict_cpu != p1->strict_cpu ) return false;
+ return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
+
+static struct ggml_threadpool * ggml_threadpool_new_impl(
+ struct ggml_threadpool_params * tpp,
+ struct ggml_cgraph * cgraph,
+ struct ggml_cplan * cplan) {
+
+ struct ggml_threadpool * threadpool =
+ GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+ {
+ threadpool->cgraph = cgraph;
+ threadpool->cplan = cplan;
+ threadpool->n_graph = 0;
+ threadpool->n_barrier = 0;
+ threadpool->n_barrier_passed = 0;
+ threadpool->current_chunk = 0;
+ threadpool->stop = false;
+ threadpool->pause = tpp->paused;
+ threadpool->workers = NULL;
+ threadpool->n_threads_max = tpp->n_threads;
+ threadpool->n_threads_cur = tpp->n_threads;
+ threadpool->poll = tpp->poll;
+ threadpool->prio = tpp->prio;
+ threadpool->ec = GGML_STATUS_SUCCESS;
+ }
+
+ // Allocate and init workers state
+ const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
+ struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+
+ memset(workers, 0, workers_size);
+ for (int j = 0; j < tpp->n_threads; j++) {
+ workers[j].threadpool = threadpool;
+ workers[j].ith = j;
+ }
+
+ threadpool->workers = workers;
+
+#ifndef GGML_USE_OPENMP
+ ggml_mutex_init(&threadpool->mutex);
+ ggml_cond_init(&threadpool->cond);
+
+ // Spin up the worker threads and assign their CPU placements.
+ // The main thread is placed last (towards the higher numbered CPU cores).
+
+ int32_t cpumask_iter = 0;
+
+ for (int j = 1; j < tpp->n_threads; j++) {
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+ int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
+ GGML_ASSERT(rc == 0);
+ }
+
+ ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+ if (!threadpool->pause) {
+ // Update main thread prio and affinity at the start, otherwise we'll do it in resume
+ ggml_thread_apply_priority(threadpool->prio);
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+ ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+ }
+ }
+#endif // GGML_USE_OPENMP
+
+ return threadpool;
+}
+
+struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
+ return ggml_threadpool_new_impl(tpp, NULL, NULL);
+}
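+
+// End-to-end sketch (comment only; assumes a graph `gf` built elsewhere): create a
+// persistent threadpool, plan the graph against it, compute, then release the pool:
+//
+//   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
+//   tpp.prio = GGML_SCHED_PRIO_HIGH;                 // optional: raise worker priority
+//   struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
+//
+//   struct ggml_cplan cplan = ggml_graph_plan(gf, tpp.n_threads, tp);
+//   uint8_t * work = cplan.work_size > 0 ? malloc(cplan.work_size) : NULL;
+//   cplan.work_data = work;
+//
+//   ggml_graph_compute(gf, &cplan);
+//
+//   free(work);
+//   ggml_threadpool_free(tp);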
+
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
- int n_threads = cplan->n_threads;
-
- struct ggml_compute_state_shared state_shared = {
- /*.cgraph =*/ cgraph,
- /*.cgraph_plan =*/ cplan,
- /*.n_threads =*/ n_threads,
- /*.n_barrier =*/ 0,
- /*.n_barrier_passed =*/ 0,
- /*.abort_callback =*/ NULL,
- /*.abort_callback_data =*/ NULL,
- /*.current_chunk =*/ 0,
- /*.ec =*/ GGML_STATUS_SUCCESS,
- };
+ int n_threads = cplan->n_threads;
+ struct ggml_threadpool * threadpool = cplan->threadpool;
+
+ bool disposable_threadpool = false;
+
+ if (threadpool == NULL) {
+ GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+ disposable_threadpool = true;
+
+ struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
+ threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
+ } else {
+ // Reset some of the parameters that need resetting
+ // No worker threads should be accessing the parameters below at this stage
+ threadpool->cgraph = cgraph;
+ threadpool->cplan = cplan;
+ threadpool->n_threads_cur = n_threads;
+ threadpool->current_chunk = 0;
+ threadpool->ec = GGML_STATUS_SUCCESS;
+ }
+
+ if (n_threads > threadpool->n_threads_max) {
+ GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
+ }
#ifdef GGML_USE_OPENMP
if (n_threads > 1) {
{
// update the number of threads from the actual number of threads that we got from OpenMP
n_threads = omp_get_num_threads();
- state_shared.n_threads = n_threads;
+ threadpool->n_threads_cur = n_threads;
}
- struct ggml_compute_state worker = {
- .thrd = 0,
- .ith = omp_get_thread_num(),
- .shared = &state_shared,
- };
- ggml_graph_compute_thread(&worker);
+ ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
}
} else {
- struct ggml_compute_state worker = {
- .thrd = 0,
- .ith = 0,
- .shared = &state_shared,
- };
- ggml_graph_compute_thread(&worker);
+ ggml_graph_compute_thread(&threadpool->workers[0]);
}
#else
- struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
- for (int j = 0; j < n_threads; ++j) {
- workers[j] = (struct ggml_compute_state) {
- .thrd = 0,
- .ith = j,
- .shared = &state_shared,
- };
- }
-
- // create thread pool
- for (int j = 1; j < n_threads; ++j) {
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
-
- // this is a work thread too
- ggml_graph_compute_thread(&workers[0]);
+ // Kick all threads to start the new graph
+ ggml_graph_compute_kickoff(threadpool);
- // join or kill thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; j++) {
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
- }
+ // This is a work thread too
+ ggml_graph_compute_thread(&threadpool->workers[0]);
#endif
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
- return state_shared.ec;
+ enum ggml_status ret = threadpool->ec;
+
+ if (disposable_threadpool) {
+ ggml_threadpool_free(threadpool);
+ }
+
+ return ret;
}
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
- struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
opt->iter = iter;
}
- struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
- uint32_t n_threads; // number of threads to use for generation
- uint32_t n_threads_batch; // number of threads to use for batch processing
+ int32_t n_threads; // number of threads to use for generation
+ int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+ // Optional: if a threadpool is not attached explicitly, ggml creates a disposable one automatically
+ LLAMA_API void llama_attach_threadpool(
+ struct llama_context * ctx,
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch);
+ LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+
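+ // Example (sketch, not a prescribed pattern): share a single ggml threadpool between
+ // generation and batch processing by passing NULL for threadpool_batch:
+ //
+ //   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
+ //   ggml_threadpool_t tp = ggml_threadpool_new(&tpp);
+ //   llama_attach_threadpool(ctx, tp, NULL);
+ //   ... run decode / encode as usual ...
+ //   llama_detach_threadpool(ctx);
+ //   ggml_threadpool_free(tp);
+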
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
// Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
- LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
+ LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
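+ // e.g. (sketch) use 4 threads for single-token generation and 8 for prompt/batch processing:
+ //   llama_set_n_threads(ctx, 4, 8);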
// Get the number of threads used for generation of a single token.
- LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+ LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token).
- LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+ LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
- uint32_t n_threads; // number of threads to use for generation
- uint32_t n_threads_batch; // number of threads to use for batch processing
+ int32_t n_threads; // number of threads to use for generation
+ int32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
#endif
ggml_backend_t backend_cpu = nullptr;
+ ggml_threadpool_t threadpool = nullptr;
+ ggml_threadpool_t threadpool_batch = nullptr;
+
bool has_evaluated_once = false;
int64_t t_start_us;
}
static void llama_graph_compute(
- llama_context & lctx,
- ggml_cgraph * gf,
- int n_threads) {
+ llama_context & lctx,
+ ggml_cgraph * gf,
+ int n_threads,
+ ggml_threadpool * threadpool) {
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+ ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
#ifdef GGML_USE_BLAS
}
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
GGML_ASSERT(n_threads > 0);
// non-causal masks do not use the KV cache
llama_set_inputs(lctx, ubatch);
- llama_graph_compute(lctx, gf, n_threads);
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
// update the kv ring buffer
{
lctx.inp_embd_enc = NULL;
lctx.n_outputs = n_tokens;
- const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
GGML_ASSERT(n_threads > 0);
ggml_backend_sched_reset(lctx.sched);
llama_set_inputs(lctx, ubatch);
- llama_graph_compute(lctx, gf, n_threads);
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
// extract embeddings
if (embd) {
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
#endif
//const int64_t t_end = ggml_time_us();
llama_set_k_shift(lctx);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
need_reserve = true;
}
}
}
+void llama_attach_threadpool(
+ struct llama_context * ctx,
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch) {
+ ctx->threadpool = threadpool;
+ ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+ ctx->threadpool = nullptr;
+ ctx->threadpool_batch = nullptr;
+}
+
void llama_backend_free(void) {
ggml_quantize_free();
}
}
}
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
ctx->cparams.n_threads = n_threads;
ctx->cparams.n_threads_batch = n_threads_batch;
}
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int32_t llama_n_threads(struct llama_context * ctx) {
return ctx->cparams.n_threads;
}
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
return ctx->cparams.n_threads_batch;
}
}
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);