base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
+ # models produced by mergekit-extract-lora have token embeddings in the adapter
+ base_name = base_name.replace(".lora_embedding_A", ".weight")
+ base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name
"--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
+ parser.add_argument(
+ "--base-model-id", type=str,
+ help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+ )
parser.add_argument(
"lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
+ base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
lparams: dict[str, Any] = json.load(f)
# load base model
- if dir_base_model is None:
+ if base_model_id is not None:
+ logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+ hparams = load_hparams_from_hf(base_model_id)
+ elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
- is_lora_a = ".lora_A.weight" in name
- is_lora_b = ".lora_B.weight" in name
+ # note: mergekit-extract-lora also adds token embeddings to the adapter
+ is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+ is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
+ # mergekit-extract-lora adds these layernorms to the adapter; we need to keep them
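+ # e.g. (illustrative) an "input_layernorm.weight" tensor is yielded whole under its
+ # base name instead of being treated as a lora_A/lora_B pair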
+ if "_layernorm" in name or ".norm" in name:
+ yield (base_name, tensor)
+ continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
+ # mergekit-extract-lora adds these layernorms to the adapter
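+ # e.g. (illustrative) a 1-D "blk.0.attn_norm.weight" vector is written out unchanged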
+ if "_norm" in dest_name:
+ assert dest_data.dim() == 1
+ yield (dest_name, dest_data)
+ continue
+
+ # otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
+ # note: mergekit-extract-lora flips and transposes A and B
+ # here we only need to transpose token_embd.lora_a; see llm_build_inp_embd()
+ if "token_embd.weight" in dest_name:
+ lora_a = lora_a.T
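+ # (illustrative) this turns token_embd.lora_a from (rank, n_vocab) into (n_vocab, rank),
+ # so that one row per token id can be selected with ggml_get_rows() at inference time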
+
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
} else {
ab_map[name].b = cur;
}
+ } else if (str_endswith(name, "_norm.weight")) {
+ // TODO: add support for norm vectors
+ // for now we skip them, since most adapters still work fine without them
+ continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
+ bool is_token_embd = str_endswith(name, "token_embd.weight");
if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
if (!model_tensor) {
- throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+ throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
- if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
- throw std::runtime_error("tensor '" + name + "' has incorrect shape");
- }
- if (w.a->ne[1] != w.b->ne[0]) {
- throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+ if (is_token_embd) {
+ // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
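+ // (illustrative, ggml ne order): token_embd.weight = [n_embd, n_vocab],
+ // lora_a = [rank, n_vocab], lora_b = [rank, n_embd]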
+ if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+ }
+ } else {
+ if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+ }
+ if (w.a->ne[1] != w.b->ne[0]) {
+ throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+ }
}
// save tensor to adapter
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;
+ // get the actual scale from alpha, the rank and the user-provided adapter scale
+ float get_scale(float alpha, float adapter_scale) {
+ const float rank = (float) b->ne[0];
+ const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
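+ // e.g. (illustrative): alpha = 16, rank = 8, adapter_scale = 1.0 -> scale = 2.0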
+ return scale;
+ }
+
llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
ggml_set_input(lctx.inp_tokens);
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+
+ // apply lora to the token embeddings if an adapter has a weight for tok_embd
+ for (auto & it : lctx.lora_adapters) {
+ struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
+ if (lora == nullptr) {
+ continue;
+ }
+ const float adapter_scale = it.second;
+ const float scale = lora->get_scale(it.first->alpha, adapter_scale);
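+ // (illustrative, ggml ne order): get_rows(lora->a, inp_tokens) -> [rank, n_tokens],
+ // mul_mat with lora->b = [rank, n_embd] -> [n_embd, n_tokens] delta added to inpL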
+ struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+ ctx, lora->b, // non-transposed lora_b
+ ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
+ ), scale);
+ inpL = ggml_add(ctx, inpL, inpL_delta);
+ }
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
if (lora == nullptr) {
continue;
}
- const float alpha = it.first->alpha;
- const float rank = (float) lora->b->ne[0];
- const float scale = alpha ? it.second * alpha / rank : it.second;
+ const float adapter_scale = it.second;
+ const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * ab_cur = ggml_mul_mat(
ctx0, lora->b,
ggml_mul_mat(ctx0, lora->a, cur)
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);