From: o7si
Date: Fri, 26 Dec 2025 15:35:29 +0000 (+0800)
Subject: server : fix crash when seq_rm fails for hybrid/recurrent models (#18391)
X-Git-Tag: upstream/0.0.7599~56
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=4893cc07bba09525d6a1720d0686ed09b5a9b1c8;p=pkg%2Fggml%2Fsources%2Fllama.cpp

server : fix crash when seq_rm fails for hybrid/recurrent models (#18391)

* server : fix crash when seq_rm fails for hybrid/recurrent models

* server : add allow_processing param to clear_slot
---

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 94825dc8..1abbf6d6 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1007,8 +1007,10 @@ private:
         return ret;
     }
 
-    void clear_slot(server_slot & slot) const {
-        GGML_ASSERT(!slot.is_processing());
+    void clear_slot(server_slot & slot, bool allow_processing = false) const {
+        if (!allow_processing) {
+            GGML_ASSERT(!slot.is_processing());
+        }
 
         SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
 
@@ -2336,7 +2338,7 @@ private:
 
         if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
             SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
-            clear_slot(slot);
+            clear_slot(slot, /*allow_processing=*/true);
 
             // there is no common part left
             slot.n_prompt_tokens_cache = 0;
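
Note: the sketch below is a minimal, standalone illustration of the guard pattern this patch introduces, not the server's actual code. The types slot_t and memory_seq_rm() are hypothetical stand-ins for server_slot and llama_memory_seq_rm(); the assumption is that hybrid/recurrent memories reject partial suffix removal, so the caller must clear the whole slot even while a request is still processing instead of tripping GGML_ASSERT and aborting.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Hypothetical, simplified stand-in for server_slot.
    struct slot_t {
        int id = 0;
        std::vector<int> cached_tokens; // tokens whose KV/recurrent state is cached
        bool processing = false;
    };

    // Simplified stand-in for llama_memory_seq_rm(): a hybrid/recurrent memory
    // cannot drop only a suffix of a sequence, so a partial removal fails.
    static bool memory_seq_rm(slot_t & slot, int p0) {
        if (p0 > 0 && p0 < (int) slot.cached_tokens.size()) {
            return false; // partial truncation unsupported
        }
        slot.cached_tokens.erase(slot.cached_tokens.begin() + p0, slot.cached_tokens.end());
        return true;
    }

    // Mirrors the patched clear_slot(): the assertion is skipped only when the
    // caller explicitly allows clearing a slot that is still processing.
    static void clear_slot(slot_t & slot, bool allow_processing = false) {
        if (!allow_processing) {
            assert(!slot.processing);
        }
        std::fprintf(stderr, "slot %d: clearing %zu tokens\n", slot.id, slot.cached_tokens.size());
        slot.cached_tokens.clear();
    }

    int main() {
        slot_t slot;
        slot.cached_tokens = {1, 2, 3, 4};
        slot.processing    = true; // a request is mid-flight

        const int p0 = 2; // try to keep the first 2 cached tokens, drop the rest
        if (!memory_seq_rm(slot, p0)) {
            // Before the fix this path hit the unconditional assert and aborted
            // the server; now the slot is cleared and the prompt is simply
            // reprocessed from position 0 (no cached prefix is reused).
            clear_slot(slot, /*allow_processing=*/true);
        }
        return 0;
    }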