}
}
+ {
+ const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+ graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+ if (graph_reuse_disable) {
+ LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+ }
+ }
+
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
// in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);
- if (res->can_reuse(gparams)) {
+ if (!graph_reuse_disable && res->can_reuse(gparams)) {
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
+ // env: LLAMA_GRAPH_REUSE_DISABLE
+ bool graph_reuse_disable = false;
+
// perf
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;
(!ubatch.embd && !other.ubatch.embd)
);
- if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+ // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+ // the reason is because the set of attention streams would be different for different sequences
+ if (can_reuse_ubatch && ubatch.equal_seqs()) {
if (!ubatch.data) {
// if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
// therefore we cannot perform the sequence id check. normally should never happen