server : dynamic token limit for prompt cache (#16560)

author Georgi Gerganov <redacted>

Tue, 14 Oct 2025 05:48:50 +0000 (08:48 +0300)

committer GitHub <redacted>

Tue, 14 Oct 2025 05:48:50 +0000 (08:48 +0300)
author Georgi Gerganov <redacted>
Tue, 14 Oct 2025 05:48:50 +0000 (08:48 +0300)
committer GitHub <redacted>
Tue, 14 Oct 2025 05:48:50 +0000 (08:48 +0300)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp

index cf12805b4998a9193f840748b78c934786d46359..77969d24e13e1d6547b0d0c67f9d315b141cdb92 100644 (file)
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
              }
          }
  
+        // average size per token
+        const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
+
+        // dynamically increase the token limit if it can fit in the memory limit
+        const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
+
          if (limit_tokens > 0) {
-            while (states.size() > 1 && n_tokens() > limit_tokens) {
+            while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
                  if (states.empty()) {
                      break;
                  }
  
-                SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
+                SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
+                        limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
  
                  states.pop_front();
              }
          }
  
-        SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n",
-                states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
+        SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
+                states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
  
          for (const auto & state : states) {
-            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
+            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
+                    (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
          }
      }
  };
author	Georgi Gerganov <redacted>
	Tue, 14 Oct 2025 05:48:50 +0000 (08:48 +0300)
committer	GitHub <redacted>
	Tue, 14 Oct 2025 05:48:50 +0000 (08:48 +0300)