server : fix crash when seq_rm fails for hybrid/recurrent models (#18391)

author o7si <redacted>

Fri, 26 Dec 2025 15:35:29 +0000 (23:35 +0800)

committer GitHub <redacted>

Fri, 26 Dec 2025 15:35:29 +0000 (16:35 +0100)
author o7si <redacted>
Fri, 26 Dec 2025 15:35:29 +0000 (23:35 +0800)
committer GitHub <redacted>
Fri, 26 Dec 2025 15:35:29 +0000 (16:35 +0100)
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp

index 94825dc8621fa8c9a61cd4ed8d376513e8ad029f..1abbf6d6d9513bacc161bbd307ae007ad6dc98c4 100644 (file)
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1007,8 +1007,10 @@ private:
          return ret;
      }
  
-    void clear_slot(server_slot & slot) const {
-        GGML_ASSERT(!slot.is_processing());
+    void clear_slot(server_slot & slot, bool allow_processing = false) const {
+        if (!allow_processing) {
+            GGML_ASSERT(!slot.is_processing());
+        }
  
          SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
  
@@ -2336,7 +2338,7 @@ private:
                      if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                          SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
  
-                        clear_slot(slot);
+                        clear_slot(slot, /*allow_processing=*/true);
  
                          // there is no common part left
                          slot.n_prompt_tokens_cache = 0;