llama_fit_params: return enum for fail vs. error (#18374)

author Johannes Gäßler <redacted>

Sat, 27 Dec 2025 08:59:19 +0000 (09:59 +0100)

committer GitHub <redacted>

Sat, 27 Dec 2025 08:59:19 +0000 (09:59 +0100)
author Johannes Gäßler <redacted>
Sat, 27 Dec 2025 08:59:19 +0000 (09:59 +0100)
committer GitHub <redacted>
Sat, 27 Dec 2025 08:59:19 +0000 (09:59 +0100)
diff --git a/include/llama.h b/include/llama.h

index f86293009916cc99912a097d27de1202116394b1..2d4e9a94e2920ce8fb58799d10cc4726247c964c 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -467,10 +467,16 @@ extern "C" {
      // Frees all allocated memory
      LLAMA_API void llama_free(struct llama_context * ctx);
  
+    enum llama_params_fit_status {
+        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
+    };
+
      // fits mparams and cparams to free device memory (assumes system memory is unlimited)
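-    // returns true if the parameters could be successfully modified to fit device memory
+    // returns a llama_params_fit_status: SUCCESS if the parameters could be modified to fit device memory, FAILURE if no fitting allocations were found, ERROR on a hard error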
      // this function is NOT thread safe because it modifies the global llama logger state
-    LLAMA_API bool llama_params_fit(
+    LLAMA_API enum llama_params_fit_status llama_params_fit(
                                     const char   * path_model,
                      struct llama_model_params   * mparams,
                      struct llama_context_params * cparams,
diff --git a/src/llama.cpp b/src/llama.cpp

index 3428b1bd3f4c353273530d5ced11998345c9d874..c53f2472b3a649f2589efe49f5f87661fd1b1dcf 100644 (file)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -140,6 +140,10 @@ enum layer_fraction_t {
  };
  // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
  
+class llama_params_fit_exception : public std::runtime_error {
+    using std::runtime_error::runtime_error;
+};
+
  static void llama_params_fit_impl(
          const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
          float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -281,28 +285,28 @@ static void llama_params_fit_impl(
      }
  
      if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
      }
      if (nd > 1) {
          if (!tensor_split) {
-            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
          }
          if (mparams->tensor_split) {
              for (size_t id = 0; id < nd; id++) {
                  if (mparams->tensor_split[id] != 0.0f) {
-                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
+                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
                  }
              }
          }
          if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
          }
      }
      if (!tensor_buft_overrides) {
-        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
      }
      if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
      }
  
      // step 3: iteratively fill the back to front with "dense" layers
@@ -385,7 +389,7 @@ static void llama_params_fit_impl(
                      tensor_buft_overrides[itbo].buft    = nullptr;
                      itbo++;
                      mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+                    throw llama_params_fit_exception("llama_params_fit_n_tensor_buft_overrides() == "
                          + std::to_string(ntbo) + " is insufficient for model\n");
                  }
                  tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
@@ -683,22 +687,25 @@ static void llama_params_fit_impl(
      set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
  }
  
-bool llama_params_fit(
+enum llama_params_fit_status llama_params_fit(
          const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
          float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
          size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
      const int64_t t0_us = llama_time_us();
-    bool ok = true;
+    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
      try {
          llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
          LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const std::runtime_error & e) {
+    } catch (const llama_params_fit_exception & e) {
          LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-        ok = false;
+        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+    } catch (const std::runtime_error & e) {
+        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
      }
      const int64_t t1_us = llama_time_us();
      LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return ok;
+    return status;
  }
  
  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp

index de47763d3e72407ebf804063c9b199d5ec9e4b24..c7e7748ca93d59afdcc245e88b80f7812c24d7b7 100644 (file)
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@@ -26,10 +26,10 @@ int main(int argc, char ** argv) {
      llama_numa_init(params.numa);
      auto mparams = common_model_params_to_llama(params);
      auto cparams = common_context_params_to_llama(params);
-    const bool success = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+    const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
          params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
          params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
-    if (!success) {
+    if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
          LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
          exit(1);
      }
author	Johannes Gäßler <redacted>
	Sat, 27 Dec 2025 08:59:19 +0000 (09:59 +0100)
committer	GitHub <redacted>
	Sat, 27 Dec 2025 08:59:19 +0000 (09:59 +0100)
include/llama.h		patch \| blob \| history
src/llama.cpp		patch \| blob \| history
tools/fit-params/fit-params.cpp		patch \| blob \| history