}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
- string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
- [](common_params & params, int value) {
- params.n_gpu_layers = value;
+ string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.n_gpu_layers = -2;
+ } else {
+ params.n_gpu_layers = std::stoi(value);
+ }
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
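// Editor's sketch, not part of the diff: the value mapping implemented by the new string-valued
// -ngl handler above, isolated into a standalone helper for clarity. The helper name is
// illustrative and does not exist in llama.cpp.
#include <string>

static int parse_gpu_layers_value(const std::string & value) {
    if (value == "auto") { return -1; } // let the library pick a layer count that fits free device memory
    if (value == "all")  { return -2; } // offload every layer
    return std::stoi(value);            // exact layer count; throws std::invalid_argument on bad input
}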
+ GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
- "number of layers to store in VRAM for the draft model",
- [](common_params & params, int value) {
- params.speculative.n_gpu_layers = value;
+ string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+ params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.speculative.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.speculative.n_gpu_layers = -2;
+ } else {
+ params.speculative.n_gpu_layers = std::stoi(value);
+ }
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
mparams.devices = params.devices.data();
}
- if (params.n_gpu_layers != -1) {
- mparams.n_gpu_layers = params.n_gpu_layers;
- }
-
+ mparams.n_gpu_layers = params.n_gpu_layers;
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
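// Editor's note, not part of the diff: with the `!= -1` guard removed, negative values
// ("auto"/"all") now reach llama_model_params unchanged; previously -1 silently fell back
// to the library default of 999 (see llama_model_default_params further down).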
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
- int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
- // returns true if the parameters could be successfully modified to fit device memory
- // this function is NOT thread safe because it modifies the global llama logger state
+ // - returns true if the parameters could be successfully modified to fit device memory
+ // - this function is NOT thread safe because it modifies the global llama logger state
+ // - only parameters that have the same value as in llama_default_model_params are modified
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
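// Editor's sketch, not part of the diff: how the "only defaults are modified" rule interacts
// with the new -1 default. The llama_params_fit call itself is omitted because its full
// parameter list is not shown in this hunk.
#include "llama.h"

static void fit_example(void) {
    llama_model_params mparams = llama_model_default_params();
    // mparams.n_gpu_layers == -1, i.e. still the default value, so llama_params_fit is
    // allowed to replace it with a layer count that fits into free device memory.

    mparams.n_gpu_layers = 10;
    // Now the field differs from the default, so llama_params_fit leaves it at 10.
    (void) mparams;
}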
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
- model.params.n_gpu_layers > (int) model.hparams.n_layer &&
- model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ model.n_gpu_layers() > model.hparams.n_layer &&
+ model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
- const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+ const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
- const auto & n_gpu_layers = params.n_gpu_layers;
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;
- const int n_layer = hparams.n_layer;
+ const int n_layer = hparams.n_layer;
+ const int n_gpu_layers = this->n_gpu_layers();
const bool use_mmap_buffer = true;
return devices.size();
}
+uint32_t llama_model::n_gpu_layers() const {
+ return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+ return params.split_mode;
+}
+
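// Editor's note, not part of the diff: a worked example for n_gpu_layers() above, assuming
// hparams.n_layer == 32:
//   params.n_gpu_layers == 16       -> n_gpu_layers() == 16
//   params.n_gpu_layers == -1 or -2 -> n_gpu_layers() == 33, i.e. greater than n_layer, which the
//                                      llama-context hunks above treat as full offload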
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
llama_model_params result = {
/*.devices =*/ nullptr,
/*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 999,
+ /*.n_gpu_layers =*/ -1,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
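// Editor's note, not part of the diff: the old default of 999 worked by exceeding any real
// layer count ("offload everything"); -1 keeps that behaviour at the model level (any negative
// value still means all layers, see llama.h above) while letting llama_params_fit distinguish
// an untouched default from a user-provided value.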
struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr;
- llama_model_params params;
-
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
size_t n_tensors() const;
size_t n_devices() const;
+ uint32_t n_gpu_layers() const;
+ llama_split_mode split_mode() const;
+
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model
ggml_cgraph * build_graph(const llm_graph_params & params) const;
private:
+ llama_model_params params;
+
struct impl;
std::unique_ptr<impl> pimpl;
};
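// Editor's note, not part of the diff: with `params` now private, code outside llama_model goes
// through the new accessors, as in the llama-context.cpp hunks above:
//   before: model.params.n_gpu_layers > (int) model.hparams.n_layer
//   after:  model.n_gpu_layers() > model.hparams.n_layer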