ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
- const llama_hparams & hparams;
+ const llama_hparams hparams;
};
class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
- const llama_hparams & hparams;
+ const llama_hparams hparams;
const llama_kv_cache_unified_context * mctx;
};
ggml_tensor * out_ids; // I32 [n_outputs]
- const llama_hparams & hparams;
- const llama_cparams & cparams;
+ const llama_hparams hparams;
+ const llama_cparams cparams;
const uint32_t n_outputs;
};
ggml_tensor * mean; // F32 [n_batch, n_batch]
- const llama_cparams & cparams;
+ const llama_cparams cparams;
};
class llm_graph_input_cls : public llm_graph_input_i {
ggml_tensor * cls; // I32 [n_batch]
- const llama_cparams & cparams;
+ const llama_cparams cparams;
};
class llm_graph_input_attn_no_cache : public llm_graph_input_i {
ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
- const llama_hparams & hparams;
- const llama_cparams & cparams;
+ const llama_hparams hparams;
+ const llama_cparams cparams;
};
class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
- const llama_hparams & hparams;
- const llama_cparams & cparams;
+ // note: these must be copies: for a graph to be reused, its inputs have to carry
+ //       these parameters with them; otherwise they can point into a freed
+ //       llm_graph_params from a previous batch, causing a stack-use-after-return
+ const llama_hparams hparams;
+ const llama_cparams cparams;
const llama_kv_cache_unified_context * mctx;
};
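
The note above is the crux of the whole diff. A minimal standalone sketch of the failure mode, using simplified stand-in types (`hparams_t`, `input_ref`, `input_copy`, `make_input` are illustrative only, not the actual llama.cpp definitions): a reference member keeps pointing into whatever params object the input was built from, so once that object's stack frame is gone, reusing the graph reads freed memory; a by-value copy makes the input self-contained.

```cpp
#include <cstdio>

struct hparams_t { int n_embd; };

// BAD: holds a reference into whatever params object it was built from;
// dangles as soon as that object goes out of scope
struct input_ref {
    const hparams_t & hparams;
};

// GOOD: carries its own copy, so it stays valid after the params are freed
struct input_copy {
    const hparams_t hparams;
};

static input_copy make_input() {
    hparams_t params = { 4096 };  // lives only for this call (like llm_graph_params)
    return input_copy { params }; // copy taken here, safe to use on later batches
    // returning input_ref{params} instead would leave a dangling reference,
    // and reading it later is exactly the stack-use-after-return ASan reports
}

int main() {
    input_copy inp = make_input();
    std::printf("n_embd = %d\n", inp.hparams.n_embd); // OK: reads the copy
}
```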
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
- const llama_hparams & hparams;
- const llama_cparams & cparams;
+ const llama_hparams hparams;
+ const llama_cparams cparams;
const llama_kv_cache_unified_iswa_context * mctx;
};
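
The note also says the copies are what make graph reuse possible in the first place. A hedged sketch of that side, again with stand-in types: because each input now owns its `hparams`/`cparams`, a `can_reuse`-style check (the names and compared fields here are illustrative, not the actual llm_graph_input_i interface) can compare the stored values against the next batch's parameters without ever touching the freed params of the batch that built the graph.

```cpp
#include <cstdint>
#include <cstdio>

struct llama_hparams_s { uint32_t n_embd; }; // stand-in
struct llama_cparams_s { uint32_t n_ctx;  }; // stand-in

struct graph_params_s {                      // stand-in for llm_graph_params
    llama_hparams_s hparams;
    llama_cparams_s cparams;
};

struct input_attn_s {
    const llama_hparams_s hparams; // copies carried by the input, as in the diff
    const llama_cparams_s cparams;

    // reuse is only valid if the stored parameters match the new batch's;
    // comparing copies is safe, comparing dangling references would not be
    bool can_reuse(const graph_params_s & params) const {
        return hparams.n_embd == params.hparams.n_embd
            && cparams.n_ctx  == params.cparams.n_ctx;
    }
};

int main() {
    input_attn_s   inp  = { {4096}, {8192} };
    graph_params_s next = { {4096}, {8192} };
    std::printf("reuse: %d\n", inp.can_reuse(next)); // 1: parameters match
}
```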