}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
- string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
- [](common_params & params, int value) {
- params.n_gpu_layers = value;
+ string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.n_gpu_layers = -2;
+ } else {
+ params.n_gpu_layers = std::stoi(value);
+ }
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
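// Editor's sketch, not part of the diff: the value mapping implemented by the new string-valued
// -ngl handler above, isolated into a standalone helper for clarity. The helper name is
// illustrative and does not exist in llama.cpp.
#include <string>

static int parse_gpu_layers_value(const std::string & value) {
    if (value == "auto") { return -1; } // let the library pick a layer count that fits free device memory
    if (value == "all")  { return -2; } // offload every layer
    return std::stoi(value);            // exact layer count; throws std::invalid_argument on bad input
}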
+ GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
- "number of layers to store in VRAM for the draft model",
- [](common_params & params, int value) {
- params.speculative.n_gpu_layers = value;
+ string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+ params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.speculative.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.speculative.n_gpu_layers = -2;
+ } else {
+ params.speculative.n_gpu_layers = std::stoi(value);
+ }
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
mparams.devices = params.devices.data();
}
- if (params.n_gpu_layers != -1) {
- mparams.n_gpu_layers = params.n_gpu_layers;
- }
-
+ mparams.n_gpu_layers = params.n_gpu_layers;
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
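// Editor's note, not part of the diff: with the `!= -1` guard removed, negative values
// ("auto"/"all") now reach llama_model_params unchanged; previously -1 silently fell back
// to the library default of 999 (see llama_model_default_params further down).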
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
- int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
- // returns true if the parameters could be successfully modified to fit device memory
- // this function is NOT thread safe because it modifies the global llama logger state
+ // - returns true if the parameters could be successfully modified to fit device memory
+ // - this function is NOT thread safe because it modifies the global llama logger state
+ // - only parameters that have the same value as in llama_default_model_params are modified
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
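// Editor's sketch, not part of the diff: how the "only defaults are modified" rule interacts
// with the new -1 default. The llama_params_fit call itself is omitted because its full
// parameter list is not shown in this hunk.
#include "llama.h"

static void fit_example(void) {
    llama_model_params mparams = llama_model_default_params();
    // mparams.n_gpu_layers == -1, i.e. still the default value, so llama_params_fit is
    // allowed to replace it with a layer count that fits into free device memory.

    mparams.n_gpu_layers = 10;
    // Now the field differs from the default, so llama_params_fit leaves it at 10.
    (void) mparams;
}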
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
- model.params.n_gpu_layers > (int) model.hparams.n_layer &&
- model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ model.n_gpu_layers() > model.hparams.n_layer &&
+ model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
- const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+ const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
- const auto & n_gpu_layers = params.n_gpu_layers;
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;
- const int n_layer = hparams.n_layer;
+ const int n_layer = hparams.n_layer;
+ const int n_gpu_layers = this->n_gpu_layers();
const bool use_mmap_buffer = true;
return devices.size();
}
+uint32_t llama_model::n_gpu_layers() const {
+ return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+ return params.split_mode;
+}
+
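// Editor's note, not part of the diff: a worked example for n_gpu_layers() above, assuming
// hparams.n_layer == 32:
//   params.n_gpu_layers == 16       -> n_gpu_layers() == 16
//   params.n_gpu_layers == -1 or -2 -> n_gpu_layers() == 33, i.e. greater than n_layer, which the
//                                      llama-context hunks above treat as full offload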
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
llama_model_params result = {
/*.devices =*/ nullptr,
/*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 999,
+ /*.n_gpu_layers =*/ -1,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
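// Editor's note, not part of the diff: the old default of 999 worked by exceeding any real
// layer count ("offload everything"); -1 keeps that behaviour at the model level (any negative
// value still means all layers, see llama.h above) while letting llama_params_fit distinguish
// an untouched default from a user-provided value.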
struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr;
- llama_model_params params;
-
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
size_t n_tensors() const;
size_t n_devices() const;
+ uint32_t n_gpu_layers() const;
+ llama_split_mode split_mode() const;
+
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model
ggml_cgraph * build_graph(const llm_graph_params & params) const;
private:
+ llama_model_params params;
+
struct impl;
std::unique_ptr<impl> pimpl;
};
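// Editor's note, not part of the diff: with `params` now private, code outside llama_model goes
// through the new accessors, as in the llama-context.cpp hunks above:
//   before: model.params.n_gpu_layers > (int) model.hparams.n_layer
//   after:  model.n_gpu_layers() > model.hparams.n_layer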