struct ggml_tensor * ffn_norm;
// ff
- struct ggml_tensor * w1;
- struct ggml_tensor * w2;
- struct ggml_tensor * w3;
+ struct ggml_tensor * ffn_gate; // w1
+ struct ggml_tensor * ffn_down; // w2
+ struct ggml_tensor * ffn_up; // w3
};
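The renamed trio follows the SwiGLU feed-forward layout used by LLaMA: ffn_gate (w1) gates, ffn_down (w2) projects back to n_embd, ffn_up (w3) projects up to n_ff. A minimal scalar sketch of the computation, for intuition only (the names follow this diff; the helper itself is not part of the patch):

#include <cmath>
#include <cstddef>
#include <vector>

using mat = std::vector<std::vector<float>>;   // [n_out][n_in], dense row-major

static float silu(float v) { return v / (1.0f + std::exp(-v)); }

static std::vector<float> matvec(const mat & w, const std::vector<float> & x) {
    std::vector<float> y(w.size(), 0.0f);
    for (std::size_t r = 0; r < w.size(); ++r)
        for (std::size_t c = 0; c < x.size(); ++c)
            y[r] += w[r][c] * x[c];
    return y;
}

// ffn(x) = ffn_down(silu(ffn_gate x) * (ffn_up x)), with '*' taken element-wise
static std::vector<float> ffn(const mat & gate, const mat & down, const mat & up,
                              const std::vector<float> & x) {
    std::vector<float> g = matvec(gate, x);                               // (n_ff)
    std::vector<float> u = matvec(up,   x);                               // (n_ff)
    for (std::size_t i = 0; i < g.size(); ++i) g[i] = silu(g[i]) * u[i];  // gating
    return matvec(down, g);                                               // (n_embd)
}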
struct my_llama_model {
uint32_t n_rank_wv = 4;
uint32_t n_rank_wo = 4;
uint32_t n_rank_ffn_norm = 1;
- uint32_t n_rank_w1 = 4;
- uint32_t n_rank_w2 = 4;
- uint32_t n_rank_w3 = 4;
+ uint32_t n_rank_ffn_gate = 4;
+ uint32_t n_rank_ffn_down = 4;
+ uint32_t n_rank_ffn_up = 4;
uint32_t n_rank_tok_embeddings = 4;
uint32_t n_rank_norm = 1;
uint32_t n_rank_output = 4;
struct ggml_tensor * ffn_norm_b;
// ff
- struct ggml_tensor * w1_a;
- struct ggml_tensor * w1_b;
- struct ggml_tensor * w2_a;
- struct ggml_tensor * w2_b;
- struct ggml_tensor * w3_a;
- struct ggml_tensor * w3_b;
+ struct ggml_tensor * ffn_gate_a;
+ struct ggml_tensor * ffn_gate_b;
+ struct ggml_tensor * ffn_down_a;
+ struct ggml_tensor * ffn_down_b;
+ struct ggml_tensor * ffn_up_a;
+ struct ggml_tensor * ffn_up_b;
};
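Every adapted weight is stored as the frozen base tensor plus a rank-r product of its _a/_b pair, which is what makes LoRA cheap: the trainable count per tensor drops from ne0*ne1 to r*(ne0+ne1). A hypothetical helper (not part of the patch) to make the arithmetic concrete:

#include <cstdint>

// trainable values added by one LoRA pair of rank r over an (ne0, ne1) tensor
static uint64_t lora_delta_params(uint32_t r, uint64_t ne0, uint64_t ne1) {
    return (uint64_t) r * (ne0 + ne1);   // _a holds r*ne0 values, _b holds r*ne1
}

// e.g. ffn_up on a 7B-style model (n_embd = 4096, n_ff = 11008) at the default r = 4:
//   4 * (4096 + 11008) = 60416 trainable values, vs 4096 * 11008 = 45088768 in the full tensor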
struct my_llama_lora {
printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
- printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1);
- printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2);
- printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3);
+ printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate);
+ printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down);
+ printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up);
printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
- layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
- layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
- layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
+ layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
+ layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
+ layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
assert_shape_1d(layer.attention_norm, hparams.n_embd);
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
assert_shape_1d(layer.ffn_norm, hparams.n_embd);
- assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
- assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd);
- assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff);
+ assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff);
+ assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd);
+ assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff);
}
}
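A note on the shape asserts: ggml orders dimensions fastest-first, so ggml_new_tensor_2d(ctx, type, ne0, ne1) gives ne1 rows of ne0 elements, and ggml_mul_mat(a, b) requires a->ne[0] == b->ne[0], producing a result of shape (a->ne[1], b->ne[1]). Hence ffn_gate and ffn_up are (n_embd, n_ff), mapping an n_embd activation up to n_ff, while ffn_down is (n_ff, n_embd) and maps back:

// ggml_mul_mat(ffn_gate /* (n_embd, n_ff) */, x /* (n_embd, n_tokens) */) -> (n_ff,   n_tokens)
// ggml_mul_mat(ffn_down /* (n_ff, n_embd) */, h /* (n_ff,   n_tokens) */) -> (n_embd, n_tokens)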
ggml_set_param(ctx, layer.wo_b);
ggml_set_param(ctx, layer.ffn_norm_a);
ggml_set_param(ctx, layer.ffn_norm_b);
- ggml_set_param(ctx, layer.w1_a);
- ggml_set_param(ctx, layer.w1_b);
- ggml_set_param(ctx, layer.w2_a);
- ggml_set_param(ctx, layer.w2_b);
- ggml_set_param(ctx, layer.w3_a);
- ggml_set_param(ctx, layer.w3_b);
+ ggml_set_param(ctx, layer.ffn_gate_a);
+ ggml_set_param(ctx, layer.ffn_gate_b);
+ ggml_set_param(ctx, layer.ffn_down_a);
+ ggml_set_param(ctx, layer.ffn_down_b);
+ ggml_set_param(ctx, layer.ffn_up_a);
+ ggml_set_param(ctx, layer.ffn_up_b);
}
}
layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
- layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
- layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
- layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
- layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
- layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
- layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
+ layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
+ layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
+ layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
+ layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
+ layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd);
+ layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff);
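With ggml's dimension ordering, the factor shapes line up: ffn_gate_a is (r, n_embd) and ffn_gate_b is (r, n_ff), so their product has exactly the base tensor's shape:

// ggml_mul_mat(ffn_gate_a /* (r, n_embd) */, ffn_gate_b /* (r, n_ff)   */) -> (n_embd, n_ff)
// ggml_mul_mat(ffn_down_a /* (r, n_ff)   */, ffn_down_b /* (r, n_embd) */) -> (n_ff, n_embd)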
ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
- ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
- ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
- ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
- ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
- ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
- ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
+ ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
+ ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
+ ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
+ ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
+ ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
+ ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
}
set_param_lora(lora);
randomize_tensor_normal(layer.ffn_norm_a, rnd);
ggml_set_zero(layer.ffn_norm_b);
- randomize_tensor_normal(layer.w1_a, rnd);
- ggml_set_zero(layer.w1_b);
- randomize_tensor_normal(layer.w2_a, rnd);
- ggml_set_zero(layer.w2_b);
- randomize_tensor_normal(layer.w3_a, rnd);
- ggml_set_zero(layer.w3_b);
+ randomize_tensor_normal(layer.ffn_gate_a, rnd);
+ ggml_set_zero(layer.ffn_gate_b);
+ randomize_tensor_normal(layer.ffn_down_a, rnd);
+ ggml_set_zero(layer.ffn_down_b);
+ randomize_tensor_normal(layer.ffn_up_a, rnd);
+ ggml_set_zero(layer.ffn_up_b);
}
free_random_normal_distribution(rnd);
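The asymmetric initialization is the standard LoRA recipe: the _a factors are drawn from a normal distribution while the _b factors start at zero, so every low-rank product, and therefore every delta, is exactly zero at step 0 and the first forward pass reproduces the base model.

// a ~ N(0, sigma), b = 0  =>  mul_mat(a, b) == 0 until training moves b away from zero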
struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
- struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
- struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
- struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
- struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
- struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
- struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
- struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
+ struct ggml_tensor * wq       = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
+ struct ggml_tensor * wk       = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
+ struct ggml_tensor * wv       = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
+ struct ggml_tensor * wo       = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
+ struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
+ struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
+ struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
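Here the effective weights are materialized: each is the frozen base tensor plus the low-rank product (add_to_f32 is a helper defined elsewhere in this file that adds in F32, casting quantized or F16 bases as needed). No alpha/r scaling is applied at this point, at least none visible in this diff; roughly:

// W_eff = W_base + mul_mat(W_a, W_b)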
struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
- struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
- struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
+ struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
+ struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
- struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
+ struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
cur = t30;
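Read top to bottom, t22 through t30 are the SwiGLU block from the sketch above, now applied to (n_embd, N*n_batch) activations:

// t22 = rms_norm(t21)                 t23, t24 = apply ffn_norm weights
// t25 = ffn_up   . t24  -> (n_ff, tokens)
// t26 = ffn_gate . t24  -> (n_ff, tokens)
// t27 = silu(t26);  t28 = t27 * t25   (element-wise gate)
// t29 = ffn_down . t28  -> (n_embd, tokens)
// t30 = t29 + t21                     (residual)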
if (enable_checkpointing) {
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
- ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
- ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
- ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
}
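The 1.0f scales are numeric no-ops; they appear to be there to add graph nodes that reference the frozen base tensors, so the graph allocator treats those buffers as in use and does not recycle them for intermediate results.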
// allocating checkpoints in one block to reduce memory fragmentation
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
- GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
- GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
- GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
init_lora(model, lora);
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
- copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a));
- copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b));
- copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a));
- copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b));
- copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a));
- copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b));
+ copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
+ copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
+ copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
+ copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
+ copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
+ copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
}
}
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
- gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1);
- gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2);
- gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3);
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate);
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down);
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up);
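Writing the ranks into the GGUF header lets a resumed run detect drift: the matching GGUF_GET_KEY calls above read them back as required keys, and the chain of != comparisons further down rejects a checkpoint whose stored ranks do not match the current invocation.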
gguf_add_tensor(fctx, lora->tok_embeddings_a);
gguf_add_tensor(fctx, lora->tok_embeddings_b);
gguf_add_tensor(fctx, layer.wo_b);
gguf_add_tensor(fctx, layer.ffn_norm_a);
gguf_add_tensor(fctx, layer.ffn_norm_b);
- gguf_add_tensor(fctx, layer.w1_a);
- gguf_add_tensor(fctx, layer.w1_b);
- gguf_add_tensor(fctx, layer.w2_a);
- gguf_add_tensor(fctx, layer.w2_b);
- gguf_add_tensor(fctx, layer.w3_a);
- gguf_add_tensor(fctx, layer.w3_b);
+ gguf_add_tensor(fctx, layer.ffn_gate_a);
+ gguf_add_tensor(fctx, layer.ffn_gate_b);
+ gguf_add_tensor(fctx, layer.ffn_down_a);
+ gguf_add_tensor(fctx, layer.ffn_down_b);
+ gguf_add_tensor(fctx, layer.ffn_up_a);
+ gguf_add_tensor(fctx, layer.ffn_up_b);
}
}
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
- write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
- write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
- write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
- write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
- write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
- write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
+ write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
+ write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
+ write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
+ write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
+ write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
+ write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
}
}
uint32_t n_rank_wv;
uint32_t n_rank_wo;
uint32_t n_rank_ffn_norm;
- uint32_t n_rank_w1;
- uint32_t n_rank_w2;
- uint32_t n_rank_w3;
+ uint32_t n_rank_ffn_gate;
+ uint32_t n_rank_ffn_down;
+ uint32_t n_rank_ffn_up;
uint32_t n_rank_tok_embeddings;
uint32_t n_rank_norm;
uint32_t n_rank_output;
bool custom_n_rank_wv;
bool custom_n_rank_wo;
bool custom_n_rank_ffn_norm;
- bool custom_n_rank_w1;
- bool custom_n_rank_w2;
- bool custom_n_rank_w3;
+ bool custom_n_rank_ffn_gate;
+ bool custom_n_rank_ffn_down;
+ bool custom_n_rank_ffn_up;
bool custom_n_rank_tok_embeddings;
bool custom_n_rank_norm;
bool custom_n_rank_output;
params.n_rank_wv = 4;
params.n_rank_wo = 4;
params.n_rank_ffn_norm = 1;
- params.n_rank_w1 = 4;
- params.n_rank_w2 = 4;
- params.n_rank_w3 = 4;
+ params.n_rank_ffn_gate = 4;
+ params.n_rank_ffn_down = 4;
+ params.n_rank_ffn_up = 4;
params.n_rank_tok_embeddings = 4;
params.n_rank_norm = 1;
params.n_rank_output = 4;
params.custom_n_rank_wv = false;
params.custom_n_rank_wo = false;
params.custom_n_rank_ffn_norm = false;
- params.custom_n_rank_w1 = false;
- params.custom_n_rank_w2 = false;
- params.custom_n_rank_w3 = false;
+ params.custom_n_rank_ffn_gate = false;
+ params.custom_n_rank_ffn_down = false;
+ params.custom_n_rank_ffn_up = false;
params.custom_n_rank_tok_embeddings = false;
params.custom_n_rank_norm = false;
params.custom_n_rank_output = false;
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
- fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n");
- fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n");
- fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n");
+ fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n");
+ fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n");
+ fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n");
print_common_train_usage(argc, argv, ¶ms->common);
}
}
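The new flags keep the tensor names' underscore style even though flags are otherwise dash-separated. An illustrative invocation (the paths, values, and the unrelated flags are placeholders, not prescribed by this patch):

./finetune --model-base open-llama-3b-v2-q8_0.gguf --train-data shakespeare.txt \
    --rank-ffn_gate 8 --rank-ffn_down 8 --rank-ffn_up 8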
params->n_rank_wo = std::stoi(argv[i]);
params->custom_n_rank_wo = true;
- } else if (arg == "--rank-w1") {
+ } else if (arg == "--rank-ffn_gate") {
if (++i >= argc) {
invalid_param = true;
break;
}
- params->n_rank_w1 = std::stoi(argv[i]);
- params->custom_n_rank_w1 = true;
- } else if (arg == "--rank-w2") {
+ params->n_rank_ffn_gate = std::stoi(argv[i]);
+ params->custom_n_rank_ffn_gate = true;
+ } else if (arg == "--rank-ffn_down") {
if (++i >= argc) {
invalid_param = true;
break;
}
- params->n_rank_w2 = std::stoi(argv[i]);
- params->custom_n_rank_w2 = true;
- } else if (arg == "--rank-w3") {
+ params->n_rank_ffn_down = std::stoi(argv[i]);
+ params->custom_n_rank_ffn_down = true;
+ } else if (arg == "--rank-ffn_up") {
if (++i >= argc) {
invalid_param = true;
break;
}
- params->n_rank_w3 = std::stoi(argv[i]);
- params->custom_n_rank_w3 = true;
+ params->n_rank_ffn_up = std::stoi(argv[i]);
+ params->custom_n_rank_ffn_up = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params);
nx += ggml_nelements(layer.wo_b);
nx += ggml_nelements(layer.ffn_norm_a);
nx += ggml_nelements(layer.ffn_norm_b);
- nx += ggml_nelements(layer.w1_a);
- nx += ggml_nelements(layer.w1_b);
- nx += ggml_nelements(layer.w2_a);
- nx += ggml_nelements(layer.w2_b);
- nx += ggml_nelements(layer.w3_a);
- nx += ggml_nelements(layer.w3_b);
+ nx += ggml_nelements(layer.ffn_gate_a);
+ nx += ggml_nelements(layer.ffn_gate_b);
+ nx += ggml_nelements(layer.ffn_down_a);
+ nx += ggml_nelements(layer.ffn_down_b);
+ nx += ggml_nelements(layer.ffn_up_a);
+ nx += ggml_nelements(layer.ffn_up_b);
}
return nx;
}
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
- uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r;
- uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r;
- uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r;
+ uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r;
+ uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r;
+ uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r;
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
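Defaults fall back to the shared --lora-r value for the matrix tensors, while the norm tensors default to rank 1: a norm weight is a vector, and with ffn_norm_a at (1, n_embd) and ffn_norm_b at (1, 1) the rank-1 product can already express any update to it, so a higher rank would only waste parameters.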
lora.hparams.n_rank_wv = n_rank_wv;
lora.hparams.n_rank_wo = n_rank_wo;
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
- lora.hparams.n_rank_w1 = n_rank_w1;
- lora.hparams.n_rank_w2 = n_rank_w2;
- lora.hparams.n_rank_w3 = n_rank_w3;
+ lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate;
+ lora.hparams.n_rank_ffn_down = n_rank_ffn_down;
+ lora.hparams.n_rank_ffn_up = n_rank_ffn_up;
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
lora.hparams.n_rank_norm = n_rank_norm;
lora.hparams.n_rank_output = n_rank_output;
|| (lora.hparams.n_rank_wv != n_rank_wv)
|| (lora.hparams.n_rank_wo != n_rank_wo)
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
- || (lora.hparams.n_rank_w1 != n_rank_w1)
- || (lora.hparams.n_rank_w2 != n_rank_w2)
- || (lora.hparams.n_rank_w3 != n_rank_w3)
+ || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate)
+ || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down)
+ || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up)
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|| (lora.hparams.n_rank_norm != n_rank_norm)
|| (lora.hparams.n_rank_output != n_rank_output)
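This rank comparison guards checkpoint resumption: if any stored rank differs from the ones requested now, the _a/_b tensor shapes would no longer match and the checkpoint cannot be reused. The remaining hunks apply the same w1/w2/w3 rename to the second training example (apparently train-text-from-scratch), which optimizes the full tensors directly; there are no _a/_b factors there, so ggml_set_param, randomize_tensor_normal, and the GGUF I/O all act on the renamed weights themselves.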
struct ggml_tensor * ffn_norm;
// ff
- struct ggml_tensor * w1;
- struct ggml_tensor * w2;
- struct ggml_tensor * w3;
+ struct ggml_tensor * ffn_gate; // w1
+ struct ggml_tensor * ffn_down; // w2
+ struct ggml_tensor * ffn_up; // w3
};
struct my_llama_model {
ggml_set_param(ctx, layer.wv);
ggml_set_param(ctx, layer.wo);
ggml_set_param(ctx, layer.ffn_norm);
- ggml_set_param(ctx, layer.w1);
- ggml_set_param(ctx, layer.w2);
- ggml_set_param(ctx, layer.w3);
+ ggml_set_param(ctx, layer.ffn_gate);
+ ggml_set_param(ctx, layer.ffn_down);
+ ggml_set_param(ctx, layer.ffn_up);
}
}
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
- layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
- layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
- layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+ layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+ layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+ layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
- ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i));
- ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
- ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
+ ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
+ ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
+ ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
}
set_param_model(model);
randomize_tensor_normal(layer.ffn_norm, rnd);
- randomize_tensor_normal(layer.w1, rnd);
- randomize_tensor_normal(layer.w2, rnd);
- randomize_tensor_normal(layer.w3, rnd);
+ randomize_tensor_normal(layer.ffn_gate, rnd);
+ randomize_tensor_normal(layer.ffn_down, rnd);
+ randomize_tensor_normal(layer.ffn_up, rnd);
}
free_random_normal_distribution(rnd);
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
- struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
- struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
+ struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
+ struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
- struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
+ struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
cur = t30;
checkpoints.push_back(cur);
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
- copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
- copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
- copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
+ copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
+ copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
+ copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
}
}
gguf_add_tensor(fctx, layer.wv);
gguf_add_tensor(fctx, layer.wo);
gguf_add_tensor(fctx, layer.ffn_norm);
- gguf_add_tensor(fctx, layer.w1);
- gguf_add_tensor(fctx, layer.w2);
- gguf_add_tensor(fctx, layer.w3);
+ gguf_add_tensor(fctx, layer.ffn_gate);
+ gguf_add_tensor(fctx, layer.ffn_down);
+ gguf_add_tensor(fctx, layer.ffn_up);
}
}
nx += ggml_nelements(layer.wv);
nx += ggml_nelements(layer.wo);
nx += ggml_nelements(layer.ffn_norm);
- nx += ggml_nelements(layer.w1);
- nx += ggml_nelements(layer.w2);
- nx += ggml_nelements(layer.w3);
+ nx += ggml_nelements(layer.ffn_gate);
+ nx += ggml_nelements(layer.ffn_down);
+ nx += ggml_nelements(layer.ffn_up);
}
return nx;
}