llama-fit-params: free memory target per device (#18679)

author Johannes Gäßler <redacted>

Thu, 8 Jan 2026 09:07:58 +0000 (10:07 +0100)

committer GitHub <redacted>

Thu, 8 Jan 2026 09:07:58 +0000 (10:07 +0100)
author Johannes Gäßler <redacted>
Thu, 8 Jan 2026 09:07:58 +0000 (10:07 +0100)
committer GitHub <redacted>
Thu, 8 Jan 2026 09:07:58 +0000 (10:07 +0100)
diff --git a/common/arg.cpp b/common/arg.cpp

index 26c790c7e0bb2ec15b0058ace5cba7852c93cb1e..9c0e6fbe789214bb9ec030fa685080fba93e54f7 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2255,7 +2255,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              std::vector<std::string> split_arg{ it, {} };
              if (split_arg.size() >= llama_max_devices()) {
                  throw std::invalid_argument(
-                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                  );
              }
              for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2295,10 +2295,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          }
      ).set_env("LLAMA_ARG_FIT"));
      add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "MiB",
-        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
-        [](common_params & params, int value) {
-            params.fit_params_target = value * size_t(1024*1024);
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
          }
      ).set_env("LLAMA_ARG_FIT_TARGET"));
      add_opt(common_arg(
diff --git a/common/common.cpp b/common/common.cpp

index 34fa3b5a422890d5b7b52adf59884b80186f49c9..744f0b4eeb4922bef5ef664be7a347e6c88ae278 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
      if (params.fit_params) {
          LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
          llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
              params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
      }
  
diff --git a/common/common.h b/common/common.h

index d55a6b71fb706ffff999069dca2693022458fdb7..7794c0268bd4a704ff65f782e53e83e0ccc598e3 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -332,12 +332,14 @@ struct common_params {
      // offload params
      std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
  
-    int32_t n_gpu_layers       = -1;               // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;                // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};              // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;             // whether to fit unset model/context parameters to free device memory
-    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
-    int32_t fit_params_min_ctx = 4096;             // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+    // margin per device in bytes for fitting parameters to free memory:
+    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
  
      enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
  
diff --git a/include/llama.h b/include/llama.h

index edc4c871a14020d901701cc8f3dd1af62e9a9e00..12e4e57d0e5857283afaaf5c2578d7492704333c 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -495,7 +495,7 @@ extern "C" {
                      struct llama_context_params * cparams,
                                            float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
          struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                         size_t   margin,                // margin of memory to leave per device in bytes
+                                         size_t * margins,               // margins of memory to leave per device in bytes
                                         uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
                              enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
  
diff --git a/src/llama.cpp b/src/llama.cpp

index dfefb3d2b50c599d73c96fc02737551068b03e9a..33f51a238901a40e2c6d0846f6983e3b49685393 100644 (file)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -147,9 +147,8 @@ class llama_params_fit_exception : public std::runtime_error {
  static void llama_params_fit_impl(
          const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
          float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
      constexpr int64_t MiB = 1024*1024;
-    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
      typedef std::vector<llama_device_memory_data> dmds_t;
      const llama_model_params default_mparams = llama_model_default_params();
  
@@ -168,6 +167,12 @@ static void llama_params_fit_impl(
          return;
      }
  
+    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+    margins.reserve(nd);
+    for (size_t id = 0; id < nd; id++) {
+        margins.push_back(margins_s[id]);
+    }
+
      std::vector<std::string> dev_names;
      {
          dev_names.reserve(nd);
@@ -187,9 +192,10 @@ static void llama_params_fit_impl(
  
      int64_t sum_free            = 0;
      int64_t sum_projected_free  = 0;
-    int64_t min_projected_free  = INT64_MAX;
      int64_t sum_projected_used  = 0;
      int64_t sum_projected_model = 0;
+    std::vector<int64_t> projected_free_per_device;
+    projected_free_per_device.reserve(nd);
  
      if (nd > 1) {
          LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -199,45 +205,63 @@ static void llama_params_fit_impl(
  
          const int64_t projected_used = dmd.mb.total();
          const int64_t projected_free = dmd.free - projected_used;
+        projected_free_per_device.push_back(projected_free);
  
          sum_free            += dmd.free;
          sum_projected_used  += projected_used;
          sum_projected_free  += projected_free;
-        min_projected_free   = std::min(min_projected_free, projected_free);
          sum_projected_model += dmd.mb.model;
  
          if (nd > 1) {
-            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
-                projected_free >= 0 ? "surplus" : "deficit");
+            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
          }
      }
      assert(sum_free >= 0 && sum_projected_used >= 0);
      LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
          __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (min_projected_free >= margin) {
-        if (nd == 1) {
+    if (nd == 1) {
+        if (projected_free_per_device[0] >= margins[0]) {
              LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, min_projected_free/MiB, margin/MiB);
+                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+            return;
+        }
+    } else {
+        bool changes_needed = false;
+        for (size_t id = 0; id < nd; id++) {
+            if (projected_free_per_device[id] < margins[id]) {
+                changes_needed = true;
+                break;
+            }
+        }
+        if (!changes_needed) {
+            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
              return;
          }
-        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
-            __func__, min_projected_free/MiB, margin/MiB);
-        return;
      }
  
      // step 2: try reducing memory use by reducing the context size
  
      {
-        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
+        int64_t global_surplus = sum_projected_free;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus -= margins[id];
+        }
          if (global_surplus < 0) {
-            LLAMA_LOG_INFO(nd == 1 ?
-                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
-                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
-                __func__, margin/MiB, -global_surplus/MiB);
+            if (nd == 1) {
+                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                    __func__, margins[0]/MiB, -global_surplus/MiB);
+            } else {
+                LLAMA_LOG_INFO(
+                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
+                    __func__, -global_surplus/MiB);
+            }
              if (cparams->n_ctx == 0) {
                  if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free - nd*margin_s;
+                    int64_t sum_used_target = sum_free;
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                      if (nd > 1) {
                          // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                          //   - for dense models only whole layers can be assigned to devices
@@ -448,9 +472,9 @@ static void llama_params_fit_impl(
          const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
              path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
  
-        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
-            global_surplus_cpu_moe += dmd.free;
-            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
+            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
          }
  
          if (global_surplus_cpu_moe > 0) {
@@ -469,7 +493,7 @@ static void llama_params_fit_impl(
      std::vector<int64_t> targets; // maximum acceptable memory use per device
      targets.reserve(nd);
      for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margin);
+        targets.push_back(dmds_full[id].free - margins[id]);
          LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
      }
  
@@ -701,11 +725,11 @@ static void llama_params_fit_impl(
  enum llama_params_fit_status llama_params_fit(
          const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
          float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
      const int64_t t0_us = llama_time_us();
      llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
      try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
          LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
      } catch (const llama_params_fit_exception & e) {
          LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp

index c7e7748ca93d59afdcc245e88b80f7812c24d7b7..f9d9cb34c7d424ea9a56330cf7cf8936eb5772f2 100644 (file)
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
      auto mparams = common_model_params_to_llama(params);
      auto cparams = common_context_params_to_llama(params);
      const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
          params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
      if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
          LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
author	Johannes Gäßler <redacted>
	Thu, 8 Jan 2026 09:07:58 +0000 (10:07 +0100)
committer	GitHub <redacted>
	Thu, 8 Jan 2026 09:07:58 +0000 (10:07 +0100)
common/arg.cpp		patch \| blob \| history
common/common.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
include/llama.h		patch \| blob \| history
src/llama.cpp		patch \| blob \| history
tools/fit-params/fit-params.cpp		patch \| blob \| history