tool/ex/tests: consistently free ctx, then model (#18168)

author Johannes Gäßler <redacted>

Mon, 22 Dec 2025 10:00:37 +0000 (11:00 +0100)

committer GitHub <redacted>

Mon, 22 Dec 2025 10:00:37 +0000 (11:00 +0100)
author Johannes Gäßler <redacted>
Mon, 22 Dec 2025 10:00:37 +0000 (11:00 +0100)
committer GitHub <redacted>
Mon, 22 Dec 2025 10:00:37 +0000 (11:00 +0100)
diff --git a/common/common.cpp b/common/common.cpp

index d4e8c7405ebdefc1124b99e2cd60834bfee2944f..acf2ec841d7e9932729f3b34df7ceda5eaf83b7c 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1078,6 +1078,8 @@ struct common_init_result::impl {
      impl() = default;
      ~impl() = default;
  
+    // note: declaration order matters - members are destroyed in reverse order of declaration (bottom-to-top), so the context declared below the model is freed before the model it uses
+
      llama_model_ptr   model;
      llama_context_ptr context;
  
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index 8786d4ee3e03d437c932e11693b57cf0024f27db..015ebae71d6b32deed71fdcbef0d5fe30f17c429 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -459,23 +459,22 @@ llama_context::llama_context(
  }
  
  llama_context::~llama_context() {
-    // FIXME this currently results in a use-after-free bug if the model is freed before the context
-    // if (!model.hparams.no_alloc) {
-    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
-    //         ggml_backend_t             backend = backend_ptrs[i];
-    //         ggml_backend_buffer_type_t buft    = backend_buft[i];
-
-    //         const size_t size_exp = backend_buf_exp_size[i];
-    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-    //         if (size_exp == size_act) {
-    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
-    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
-    //         } else {
-    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
-    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
-    //         }
-    //     }
-    // }
+    if (!model.hparams.no_alloc) {
+        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+            ggml_backend_t             backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft    = backend_buft[i];
+
+            const size_t size_exp = backend_buf_exp_size[i];
+            const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+            if (size_exp == size_act) {
+                LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            } else {
+                LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            }
+        }
+    }
      ggml_opt_free(opt_ctx);
  }
  
diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp

index 566b039a0703804dd99b6ded66f701f00e94056e..34746c200ca9ca24aabe50946d0cd18034ae9d0a 100644 (file)
--- a/tests/test-grammar-llguidance.cpp
+++ b/tests/test-grammar-llguidance.cpp
@@ -1196,6 +1196,9 @@ int main(int argc, const char ** argv) {
  
      test_sampler_chain();
  
+    llama_free(ctx);
+    llama_model_free(model);
+
      fprintf(stdout, "All tests passed.\n");
      return 0;
  }
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp

index 59dda48772aea9642b505be12a8e1fe1d7b9e247..37f8312c46fe69ceff8965783823f8902a3e0803 100644 (file)
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -300,8 +300,8 @@ int main(int argc, char **argv) {
          fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
      }
  
-    llama_model_free(model);
      llama_free(ctx);
+    llama_model_free(model);
  
      llama_backend_free();
  
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp

index b183da47f3cc841e52b79992b803eec958d0e12c..505dbfdb93d68dd462a7184f0f1bdb8335900642 100644 (file)
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -146,8 +146,8 @@ int main(int argc, char **argv) {
          }
      }
  
-    llama_model_free(model);
      llama_free(ctx);
+    llama_model_free(model);
  
      llama_backend_free();
  
diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp

index ba6e94ba8ea57a051469a07faf4e387713185f03..8e370d2c7b4fe6ced8a2f6bf627251fc178a2645 100644 (file)
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -116,8 +116,8 @@ int main(int argc, char ** argv) {
          }
      }
  
-    llama_model_free(model);
      llama_free(ctx);
+    llama_model_free(model);
  
      llama_backend_free();
  
diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp

index 2032a386bb4d2c61a1aa5a1fe089ecc46079e2bd..0f627c5ff65ad5da8925523044d8bcf5a9dc178b 100644 (file)
--- a/tools/batched-bench/batched-bench.cpp
+++ b/tools/batched-bench/batched-bench.cpp
@@ -55,6 +55,7 @@ int main(int argc, char ** argv) {
  
      if (ctx == NULL) {
          fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        llama_model_free(model);
          return 1;
      }
  
@@ -108,6 +109,8 @@ int main(int argc, char ** argv) {
  
          if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
              LOG_ERR("%s: llama_decode() failed\n", __func__);
+            llama_free(ctx);
+            llama_model_free(model);
              return 1;
          }
      }
@@ -147,6 +150,8 @@ int main(int argc, char ** argv) {
  
                  if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) {
                      LOG_ERR("%s: llama_decode() failed\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(model);
                      return 1;
                  }
  
@@ -165,6 +170,8 @@ int main(int argc, char ** argv) {
                          common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true);
                          if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
                              LOG_ERR("%s: llama_decode() failed\n", __func__);
+                            llama_free(ctx);
+                            llama_model_free(model);
                              return 1;
                          }
                          llama_memory_seq_rm(mem, 0, pp, -1);
@@ -184,6 +191,8 @@ int main(int argc, char ** argv) {
  
                              if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
                                  LOG_ERR("%s: llama_decode() failed\n", __func__);
+                                llama_free(ctx);
+                                llama_model_free(model);
                                  return 1;
                              }
                          }
@@ -200,6 +209,8 @@ int main(int argc, char ** argv) {
  
                          if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
                              LOG_ERR("%s: llama_decode() failed\n", __func__);
+                            llama_free(ctx);
+                            llama_model_free(model);
                              return 1;
                          }
                      }
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp

index 0be6ed69483bef88a86bdc1d7b4c7bf0a0f8cfd6..b431c7f31bf754291168eda6685f522573cfa4f2 100644 (file)
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -2102,6 +2102,8 @@ int main(int argc, char ** argv) {
          struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
          if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
              fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            llama_free(ctx);
+            llama_model_free(lmodel);
              exit(1);
          }
          tpp.strict_cpu = t.cpu_strict;
@@ -2111,6 +2113,8 @@ int main(int argc, char ** argv) {
          struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
          if (!threadpool) {
              fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            llama_free(ctx);
+            llama_model_free(lmodel);
              exit(1);
          }
  
@@ -2126,6 +2130,8 @@ int main(int argc, char ** argv) {
                  bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                  if (!res) {
                      fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                      exit(1);
                  }
              }
@@ -2136,6 +2142,8 @@ int main(int argc, char ** argv) {
                  bool res = test_gen(ctx, 1, t.n_threads);
                  if (!res) {
                      fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                      exit(1);
                  }
              }
@@ -2164,6 +2172,8 @@ int main(int argc, char ** argv) {
                      bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                      if (!res) {
                          fprintf(stderr, "%s: error: failed to run depth\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
                          exit(1);
                      }
  
@@ -2189,6 +2199,8 @@ int main(int argc, char ** argv) {
                  bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                  if (!res) {
                      fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                      exit(1);
                  }
              }
@@ -2200,6 +2212,8 @@ int main(int argc, char ** argv) {
                  bool res = test_gen(ctx, t.n_gen, t.n_threads);
                  if (!res) {
                      fprintf(stderr, "%s: error: failed to run gen\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                      exit(1);
                  }
              }
author	Johannes Gäßler <redacted>
	Mon, 22 Dec 2025 10:00:37 +0000 (11:00 +0100)
committer	GitHub <redacted>
	Mon, 22 Dec 2025 10:00:37 +0000 (11:00 +0100)
common/common.cpp		patch \| blob \| history
src/llama-context.cpp		patch \| blob \| history
tests/test-grammar-llguidance.cpp		patch \| blob \| history
tests/test-tokenizer-0.cpp		patch \| blob \| history
tests/test-tokenizer-1-bpe.cpp		patch \| blob \| history
tests/test-tokenizer-1-spm.cpp		patch \| blob \| history
tools/batched-bench/batched-bench.cpp		patch \| blob \| history
tools/llama-bench/llama-bench.cpp		patch \| blob \| history