};
struct llama_adapter_lora_deleter {
- void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
+ void operator()(llama_adapter_lora *) {
+ // no-op: llama_adapter_lora_free is deprecated; the adapter is freed together with its model
+ }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
// Manually free a LoRA adapter
- // NOTE: loaded adapters will be free when the associated model is deleted
+ // NOTE: loaded adapters will be freed when the associated model is deleted
- LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+ LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
+ "adapters are now freed together with the associated model");
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
return nullptr;
}
-static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
- llama_model & model = adapter.model;
-
ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
}
}
- // update number of nodes used
- model.n_lora_nodes += adapter.get_n_nodes();
+ // register the adapter with the model, which takes ownership of it
+ model.loras.insert(&adapter);
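+ // note: registration is the last step of a successful load, so a failed init never leaves a stale pointer in the set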
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
- llama_adapter_lora * adapter = new llama_adapter_lora(*model);
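+ // the adapter no longer keeps a back-reference to the model (see struct llama_adapter_lora)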
+ llama_adapter_lora * adapter = new llama_adapter_lora();
try {
- llama_adapter_lora_init_impl(path_lora, *adapter);
+ llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
-void llama_adapter_lora_free(llama_adapter_lora * adapter) {
- // update number of nodes used
- GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
- adapter->model.n_lora_nodes -= adapter->get_n_nodes();
-
- delete adapter;
+void llama_adapter_lora_free(llama_adapter_lora *) {
+ // deprecated: adapters are freed by llama_model's destructor
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
};
struct llama_adapter_lora {
- llama_model & model;
-
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
- llama_adapter_lora(llama_model & model) : model(model) {}
+ llama_adapter_lora() = default;
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
- res += model.n_lora_nodes;
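+ // account for the extra graph nodes contributed by each adapter registered with the model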
+ for (const auto & lora : model.loras) {
+ res += lora->get_n_nodes();
+ }
return res;
}
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}
-llama_model::~llama_model() = default;
+llama_model::~llama_model() {
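+ // free all LoRA adapters registered with this model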
+ for (auto * lora : loras) {
+ delete lora;
+ }
+}
void llama_model::load_stats(llama_model_loader & ml) {
pimpl->n_elements = ml.n_elements;
#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
struct llama_cparams;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
- // for keeping track of extra nodes used by lora adapters
- uint32_t n_lora_nodes = 0;
+ // for keeping track of associated LoRA adapters
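+ // (owned by the model and deleted in ~llama_model())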
+ std::unordered_set<llama_adapter_lora *> loras;
int64_t t_load_us = 0;
int64_t t_start_us = 0;