From: Georgi Gerganov Date: Wed, 25 Feb 2026 13:15:42 +0000 (+0200) Subject: server : enable multi-modal prompt caching (#19877) X-Git-Tag: gguf-v0.18.0~21 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f20469d91948975e001c286836f714c1819c968f;p=pkg%2Fggml%2Fsources%2Fllama.cpp server : enable multi-modal prompt caching (#19877) --- diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 67c3988bd..73af81243 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -995,9 +995,6 @@ private: // don't update the cache if the slot's context is empty update_cache = update_cache && tokens.size() > 0; - // TODO: mtmd does not support prompt cache - update_cache = update_cache && (ret->mctx == nullptr); - if (update_cache) { SRV_WRN("%s", "updating prompt cache\n"); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 739e30a70..d3aba1848 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1900,10 +1900,9 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t return nullptr; } - // TODO: for some reason we can't copy server_tokens, so we have to do this workaround auto & cur = states.emplace_back(); cur = { - /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false), + /*.tokens =*/ prompt.tokens.clone(), /*.data =*/ std::move(state_data), /*.checkpoints =*/ prompt.checkpoints, };