}
}
+ {
+ const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+ graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+ if (graph_reuse_disable) {
+ LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+ }
+ }
+
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
// in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);
- if (res->can_reuse(gparams)) {
+ if (!graph_reuse_disable && res->can_reuse(gparams)) {
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
+ // env: LLAMA_GRAPH_REUSE_DISABLE
+ bool graph_reuse_disable = false;
+
// perf
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;
(!ubatch.embd && !other.ubatch.embd)
);
- if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+ // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+ // the reason is because the set of attention streams would be different for different sequences
+ if (can_reuse_ubatch && ubatch.equal_seqs()) {
if (!ubatch.data) {
// if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
// therefore we cannot perform the sequence id check. normally should never happen