                    global_surplus += memory_reduction;
                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
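+                    // a non-negative surplus means reducing the context freed enough memory overall;
+                    // nd is presumably the number of backend devices in the enclosing code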
+                    if (global_surplus >= 0) {
+                        if (nd == 1) {
+                            LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
+                            return;
+                        }
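+                        // with multiple devices a non-negative total surplus does not guarantee a
+                        // per-device fit, so log the expectation and continue instead of returning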
+ LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+ }
                } else {
                    LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                        __func__, hp_nct, n_ctx_min);
                }
            } else {
                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
-        if (global_surplus >= 0) {
-            LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
-            return;
-        }
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {