if (slot.n_past > 0 && slot.n_past < (int) slot.prompt.tokens.size()) {
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
if (pos_min == -1) {
- SLT_ERR(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
+ SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
}

if (pos_min > pos_min_thold) {
- SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
+ SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
// search for a context checkpoint
const auto it = std::find_if(/* ... */);
}
}
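
For context: the `std::find_if` above (arguments elided) scans the slot's saved context checkpoints for one that is still useful, i.e. one that restores KV-cache positions older than the SWA threshold implied by pos_min_thold. A minimal self-contained sketch of that kind of search, assuming a hypothetical ctx_checkpoint struct with pos_min/pos_max fields and a newest-first scan (the actual field names, container, and exact usefulness criterion in server.cpp may differ):

#include <algorithm>
#include <cstdio>
#include <vector>

// hypothetical checkpoint record: the KV-cache position range it can restore
struct ctx_checkpoint {
    int pos_min;
    int pos_max;
};

int main() {
    std::vector<ctx_checkpoint> checkpoints = {
        {0, 255}, {256, 511}, {512, 1023},
    };

    const int n_past = 900; // hypothetical values
    const int n_swa  = 512;

    // a checkpoint is assumed useful only if it restores positions older
    // than the SWA window, i.e. its pos_min is below this threshold
    const int pos_min_thold = std::max(0, n_past - n_swa);

    // scan newest-to-oldest for the first checkpoint that qualifies
    const auto it = std::find_if(checkpoints.rbegin(), checkpoints.rend(),
        [&](const ctx_checkpoint & cur) {
            return cur.pos_min < pos_min_thold;
        });

    if (it != checkpoints.rend()) {
        printf("restoring checkpoint [%d, %d]\n", it->pos_min, it->pos_max);
    } else {
        printf("no usable checkpoint - reprocessing from scratch\n");
    }
    return 0;
}

Scanning in reverse picks the most recent qualifying checkpoint, which minimizes how much of the prompt has to be reprocessed.
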
- // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+ // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.prompt.tokens.str().c_str());
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_past / slot.n_prompt_tokens());
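
The logged progress is simply the fraction of prompt tokens that have entered the cache so far. A toy loop, with a hypothetical prompt length and batch size, reproduces the shape of these log lines:

#include <cstdio>

int main() {
    const int n_prompt = 1000; // hypothetical prompt length
    const int n_batch  = 256;  // hypothetical max tokens per decode step

    for (int n_past = 0; n_past < n_prompt; ) {
        const int n_tokens = (n_prompt - n_past < n_batch) ? n_prompt - n_past : n_batch;
        n_past += n_tokens;
        printf("n_past = %d, n_tokens = %d, progress = %f\n",
               n_past, n_tokens, (float) n_past / n_prompt);
    }
    return 0;
}
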
// allowed to resize: at a text token, or at a boundary that keeps a trailing image whole
// disallowed to resize: at any position inside an image's run of placeholder tokens
if (n > 0) {
- llama_token last_token = tokens[n - 1];
// make sure we never remove tokens in the middle of an image
- if (last_token == LLAMA_TOKEN_NULL) {
+ // note that the case where we keep a full image at the end is allowed:
+ // tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
+ if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) {
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
}
}
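
To see what the new condition changes: a truncation point n is rejected only when the token before the cut and the token at the cut are both image placeholders, i.e. the cut lands strictly inside an image; a cut that ends exactly on an image's last placeholder keeps the image whole and is now accepted. A standalone sketch of the rule, using -1 as a stand-in for LLAMA_TOKEN_NULL and a hypothetical cut_is_allowed() helper (the real code instead calls find_chunk(n - 1), which throws unless the cut falls on a chunk boundary):

#include <cstdio>
#include <vector>

// stand-in for llama.cpp's LLAMA_TOKEN_NULL placeholder used for image embeddings
static const int TOKEN_NULL = -1;

// hypothetical helper: can the token list be truncated to its first n tokens
// without splitting an image chunk? (assumes 0 < n < tokens.size())
static bool cut_is_allowed(const std::vector<int> & tokens, size_t n) {
    // rejected only when the cut lands strictly inside an image:
    // the token before the cut AND the token at the cut are both placeholders
    return !(tokens[n - 1] == TOKEN_NULL && tokens[n] == TOKEN_NULL);
}

int main() {
    // layout: 2 text tokens, a 3-placeholder image, 2 more text tokens
    const std::vector<int> tokens = {101, 102, TOKEN_NULL, TOKEN_NULL, TOKEN_NULL, 103, 104};

    for (size_t n = 1; n < tokens.size(); n++) {
        printf("keep_first(%zu): %s\n", n, cut_is_allowed(tokens, n) ? "allowed" : "disallowed");
    }
    return 0;
}

With this layout, cuts at n = 3 and n = 4 (inside the image) are disallowed; every other cut, including n = 5 right after the image's last placeholder, is allowed.
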