};
struct llama_adapter_lora_deleter {
- void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
+ void operator()(llama_adapter_lora *) {
+ // no-op: llama_adapter_lora_free is deprecated; the adapter is freed together with its model
+ }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
// Manually free a LoRA adapter
- // NOTE: loaded adapters will be free when the associated model is deleted
+ // NOTE: loaded adapters will be freed when the associated model is deleted
- LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+ LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
+ "adapters are now freed together with the associated model");
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
return nullptr;
}
-static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
- llama_model & model = adapter.model;
-
ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
}
}
- // update number of nodes used
- model.n_lora_nodes += adapter.get_n_nodes();
+ // register the adapter with the model, which takes ownership of it
+ model.loras.insert(&adapter);
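+ // note: registration is the last step of a successful load, so a failed init never leaves a stale pointer in the set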
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
- llama_adapter_lora * adapter = new llama_adapter_lora(*model);
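+ // the adapter no longer keeps a back-reference to the model (see struct llama_adapter_lora)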
+ llama_adapter_lora * adapter = new llama_adapter_lora();
try {
- llama_adapter_lora_init_impl(path_lora, *adapter);
+ llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
-void llama_adapter_lora_free(llama_adapter_lora * adapter) {
- // update number of nodes used
- GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
- adapter->model.n_lora_nodes -= adapter->get_n_nodes();
-
- delete adapter;
+void llama_adapter_lora_free(llama_adapter_lora *) {
+ // deprecated: adapters are freed by llama_model's destructor
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
};
struct llama_adapter_lora {
- llama_model & model;
-
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
- llama_adapter_lora(llama_model & model) : model(model) {}
+ llama_adapter_lora() = default;
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
- res += model.n_lora_nodes;
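+ // account for the extra graph nodes contributed by each adapter registered with the model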
+ for (const auto & lora : model.loras) {
+ res += lora->get_n_nodes();
+ }
return res;
}
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}
-llama_model::~llama_model() = default;
+llama_model::~llama_model() {
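+ // free all LoRA adapters registered with this model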
+ for (auto * lora : loras) {
+ delete lora;
+ }
+}
void llama_model::load_stats(llama_model_loader & ml) {
pimpl->n_elements = ml.n_elements;
#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
struct llama_cparams;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
- // for keeping track of extra nodes used by lora adapters
- uint32_t n_lora_nodes = 0;
+ // for keeping track of associated LoRA adapters
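+ // (owned by the model and deleted in ~llama_model())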
+ std::unordered_set<llama_adapter_lora *> loras;
int64_t t_load_us = 0;
int64_t t_start_us = 0;