llama : re-enable manual LoRA adapter free (#19983)

author Pop Flamingo <redacted>

Wed, 18 Mar 2026 10:03:26 +0000 (11:03 +0100)

committer GitHub <redacted>

Wed, 18 Mar 2026 10:03:26 +0000 (12:03 +0200)
author Pop Flamingo <redacted>
Wed, 18 Mar 2026 10:03:26 +0000 (11:03 +0100)
committer GitHub <redacted>
Wed, 18 Mar 2026 10:03:26 +0000 (12:03 +0200)
diff --git a/common/common.cpp b/common/common.cpp

index cc423d3439fc7ace8d55b04fba0068f15376626f..59d75a3b95ccd2d5f7a4e473206017eaae7e11b7 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1067,7 +1067,7 @@ common_init_result::common_init_result(common_params & params) :
  
      const llama_vocab * vocab = llama_model_get_vocab(model);
  
-    // load and optionally apply lora adapters (must be loaded before context creation)
+    // load and optionally apply lora adapters
      for (auto & la : params.lora_adapters) {
          llama_adapter_lora_ptr lora;
          lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
diff --git a/include/llama-cpp.h b/include/llama-cpp.h

index 807e77f628071b17003bc9b945927fa5a2654a18..8f6368177de0918e67daa4059e34519eecb42a78 100644 (file)
--- a/include/llama-cpp.h
+++ b/include/llama-cpp.h
@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
  };
  
  struct llama_adapter_lora_deleter {
-    void operator()(llama_adapter_lora *) {
-        // llama_adapter_lora_free is deprecated
-    }
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
  };
  
  typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
diff --git a/include/llama.h b/include/llama.h

index c6e102abe519f0c9393fe7903f61ad9a6f2d1641..6e72db7e3cd177c7f3acd3e8ef51e83ffdb9377a 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -636,7 +636,6 @@ extern "C" {
  
      // Load a LoRA adapter from file
      // The adapter is valid as long as the associated model is not freed
-    // All adapters must be loaded before context creation
      LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
              struct llama_model * model,
              const char * path_lora);
@@ -660,9 +659,8 @@ extern "C" {
      LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
  
      // Manually free a LoRA adapter
-    // NOTE: loaded adapters will be free when the associated model is deleted
-    LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
-            "adapters are now freed together with the associated model");
+    // NOTE: adapters that are not freed manually are freed automatically when the associated model is deleted
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
  
      // Get the invocation tokens if the current lora is an alora
      LLAMA_API uint64_t            llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp

index d6a5800e63a7209449ca1c6ef979728e00e5304d..2f2cc12af04ea7eca827bf9dd1a6c5f007354755 100644 (file)
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -418,7 +418,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
  }
  
  llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora(model);
  
      try {
          llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -471,8 +471,17 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
      return snprintf(buf, buf_size, "%s", it->second.c_str());
  }
  
-void llama_adapter_lora_free(llama_adapter_lora *) {
-    // deprecated: adapters are freed by llama_model's destructor
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    if (adapter == nullptr) {
+        return;
+    }
+
+    if (adapter->model != nullptr) {
+        adapter->model->loras.erase(adapter);
+        adapter->model = nullptr;
+    }
+
+    delete adapter;
  }
  
  uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
diff --git a/src/llama-adapter.h b/src/llama-adapter.h

index aa3ab63ad7531c737b371eca3e12e78ec9c7476b..f0b1e50f81695e9b1ef11bbd2db277fc1a921add 100644 (file)
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -61,6 +61,8 @@ struct llama_adapter_lora_weight {
  };
  
  struct llama_adapter_lora {
+    llama_model * model = nullptr;
+
      // map tensor name to lora_a_b
      std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
  
@@ -75,7 +77,7 @@ struct llama_adapter_lora {
      // activated lora (aLoRA)
      std::vector<llama_token> alora_invocation_tokens;
  
-    llama_adapter_lora() = default;
+    explicit llama_adapter_lora(llama_model * model) : model(model) {}
      ~llama_adapter_lora() = default;
  
      llama_adapter_lora_weight * get_weight(ggml_tensor * w);
author	Pop Flamingo <redacted>
	Wed, 18 Mar 2026 10:03:26 +0000 (11:03 +0100)
committer	GitHub <redacted>
	Wed, 18 Mar 2026 10:03:26 +0000 (12:03 +0200)
common/common.cpp		patch \| blob \| history
include/llama-cpp.h		patch \| blob \| history
include/llama.h		patch \| blob \| history
src/llama-adapter.cpp		patch \| blob \| history
src/llama-adapter.h		patch \| blob \| history