server : fix slot selection by lru (#10126)

author sasha0552 <redacted>

Sat, 2 Nov 2024 16:34:56 +0000 (16:34 +0000)

committer GitHub <redacted>

Sat, 2 Nov 2024 16:34:56 +0000 (18:34 +0200)
author sasha0552 <redacted>
Sat, 2 Nov 2024 16:34:56 +0000 (16:34 +0000)
committer GitHub <redacted>
Sat, 2 Nov 2024 16:34:56 +0000 (18:34 +0200)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 5c1af549b2e1ce26e6cd95cf07c6a0d39b1c065e..8531a784ded3dae5a423d3e8eff9c0908afc8405 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -247,6 +247,7 @@ struct server_slot {
          if (is_processing()) {
              SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
  
+            t_last_used = ggml_time_us();
              t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
              state = SLOT_STATE_IDLE;
              callback_on_release(id);
@@ -730,7 +731,7 @@ struct server_context {
  
          // find the slot that has at least n% prompt similarity
          if (ret == nullptr && slot_prompt_similarity != 0.0f) {
-            int max_lcs_len = 0;
+            int lcs_len = 0;
              float similarity = 0;
  
              for (server_slot & slot : slots) {
@@ -745,20 +746,21 @@ struct server_context {
                  }
  
                  // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-                int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+                int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
  
                  // fraction of the common subsequence length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
+                float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
  
                  // select the current slot if the criteria match
-                if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
-                    max_lcs_len = lcs_len;
+                if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
+                    lcs_len = cur_lcs_len;
+                    similarity = cur_similarity;
                      ret = &slot;
                  }
              }
  
              if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
+                SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
              }
          }
  
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp

index 871a17a4f617ab46648a09873e04bbcfccc703e1..c47ed3e47a76dc35aacc7656367e56505769186d 100644 (file)
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
      }
  
      // get the lengths of the input sequences
-    int a_len = a.size();
-    int b_len = b.size();
+    size_t a_len = a.size();
+    size_t b_len = b.size();
  
      // initialize the maximum length of the longest common subsequence (LCS)
-    int max_length = 0;
+    size_t max_length = 0;
  
      // use two rows instead of a 2D matrix to optimize space
-    std::vector<int> prev_row(b_len + 1, 0);
-    std::vector<int> curr_row(b_len + 1, 0);
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
  
      // iterate through the elements of a
-    for (int i = 1; i <= a_len; i++) {
+    for (size_t i = 1; i <= a_len; i++) {
          // iterate through the elements of b
-        for (int j = 1; j <= b_len; j++) {
+        for (size_t j = 1; j <= b_len; j++) {
              // if elements at the current positions match
              if (a[i - 1] == b[j - 1]) {
                  // if it's the first element of either sequences, set LCS length to 1
author	sasha0552 <redacted>
	Sat, 2 Nov 2024 16:34:56 +0000 (16:34 +0000)
committer	GitHub <redacted>
	Sat, 2 Nov 2024 16:34:56 +0000 (18:34 +0200)
examples/server/server.cpp		patch | blob | history
examples/server/utils.hpp		patch | blob | history