From: compilade
Date: Thu, 12 Jun 2025 06:56:04 +0000 (-0400)
Subject: context : round n_tokens to next multiple of n_seqs when reserving (#14140)
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=a20b2b05bce6622c585459ebf46f142f113d021c;p=pkg%2Fggml%2Fsources%2Fllama.cpp

context : round n_tokens to next multiple of n_seqs when reserving (#14140)

This fixes RWKV inference, which otherwise failed when the worst-case
ubatch.n_seq_tokens rounded down to 0.
---

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b130b484..525a00d8 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1332,7 +1332,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
 
     if (n_tokens % n_seqs != 0) {
-        n_tokens = (n_tokens / n_seqs) * n_seqs;
+        n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
         n_outputs = std::min(n_outputs, n_tokens);
 
         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
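
For context (not part of the commit): a minimal standalone C++ sketch of the rounding change. Only the two rounding expressions come from the diff above; the helper names, the example values (3 tokens over 4 sequences), and the main() driver are illustrative assumptions, not llama.cpp code.

    #include <cstdint>
    #include <cstdio>

    // Assumed helper: old behaviour, floors n_tokens to a multiple of n_seqs.
    static uint32_t round_down_to_multiple(uint32_t n_tokens, uint32_t n_seqs) {
        return (n_tokens / n_seqs) * n_seqs;                  // e.g. 3 tokens, 4 seqs -> 0
    }

    // Assumed helper: new behaviour, rounds n_tokens up to the next multiple of n_seqs.
    static uint32_t round_up_to_multiple(uint32_t n_tokens, uint32_t n_seqs) {
        return ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // e.g. 3 tokens, 4 seqs -> 4
    }

    int main() {
        const uint32_t n_tokens = 3, n_seqs = 4;              // assumed example values
        const uint32_t down = round_down_to_multiple(n_tokens, n_seqs);
        const uint32_t up   = round_up_to_multiple(n_tokens, n_seqs);
        // With flooring, n_seq_tokens (= n_tokens / n_seqs) can become 0 when
        // n_tokens < n_seqs; the commit notes RWKV inference failed in that case.
        std::printf("old: n_tokens = %u, n_seq_tokens = %u\n", (unsigned) down, (unsigned) (down / n_seqs));
        std::printf("new: n_tokens = %u, n_seq_tokens = %u\n", (unsigned) up,   (unsigned) (up   / n_seqs));
        return 0;
    }

Under these assumptions the old expression reserves a graph for 0 tokens (so 0 tokens per sequence), while rounding up guarantees at least one token per sequence in the reserved worst-case ubatch.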