int32_t n_head = 32; // model.config.num_attention_heads
int32_t n_layer = 32; // model.config.num_hidden_layers
int32_t n_rot = 20; // rotary_pct[25%] * (n_embd / n_head)
+ int32_t par_res = 1; // model.config.use_parallel_residual (1 = true, 0 = false)
int32_t ftype = GGML_FTYPE_MOSTLY_F16;
};
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
+ fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
+ printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
printf("%s: qntvr = %d\n", __func__, qntvr);
return true;
}
+// feed-forward network
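+// computes: cur = proj_w*gelu(fc_w*(ln_2_g*norm(inp) + ln_2_b) + fc_b) + proj_b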
+ggml_tensor * gpt_neox_ff(
+ const dollyv2_layer &layer,
+ ggml_context * ctx0,
+ ggml_tensor * inp) {
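+ // post-attention layer norm
+ // cur = ln_2_g*norm(inp) + ln_2_b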
+ ggml_tensor * cur = ggml_norm(ctx0, inp);
+
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0,
+ ggml_repeat(ctx0, layer.ln_2_g, cur),
+ cur),
+ ggml_repeat(ctx0, layer.ln_2_b, cur));
+
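+ // fully connected: cur = fc_w*cur + fc_b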
+ cur = ggml_mul_mat(ctx0,
+ layer.c_mlp_fc_w,
+ cur);
+
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
+ cur);
+
+ // GELU activation
+ cur = ggml_gelu(ctx0, cur);
+
+ // projection
+ // cur = proj_w*cur + proj_b
+ cur = ggml_mul_mat(ctx0,
+ layer.c_mlp_proj_w,
+ cur);
+
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
+ cur);
+ return cur;
+}
+
// evaluate the transformer
//
// - model: the model
}
}
- struct ggml_tensor * inpFF = cur;
-
- // feed-forward network
- // this is independent of the self-attention result, so it could be done in parallel to the self-attention
- {
- // post attention layer norm
- // note here we pass inpL instead of cur
- {
- cur = ggml_norm(ctx0, inpL);
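+ // feed-forward network
+ // par_res == 0: apply the FF to (self-attention + input); otherwise compute the FF on the layer input, in parallel with the self-attention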
+ if (hparams.par_res == 0) {
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
- cur),
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
- }
+ cur = gpt_neox_ff(model.layers[il], ctx0, inpFF);
- cur = ggml_mul_mat(ctx0,
- model.layers[il].c_mlp_fc_w,
- cur);
+ // input for next layer
+ inpL = ggml_add(ctx0, cur, inpFF);
+ } else {
+ struct ggml_tensor * inpFF = cur;
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
- cur);
+ // the FF input is independent of the self-attention result, so it could be computed in parallel with the self-attention
+ // note here we pass inpL instead of cur
+ cur = gpt_neox_ff(model.layers[il], ctx0, inpL);
- // GELU activation
- cur = ggml_gelu(ctx0, cur);
+ // self-attention output + FF output
+ cur = ggml_add(ctx0, cur, inpFF);
- // projection
- // cur = proj_w*cur + proj_b
- cur = ggml_mul_mat(ctx0,
- model.layers[il].c_mlp_proj_w,
- cur);
-
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
- cur);
+ // input for next layer
+ inpL = ggml_add(ctx0, cur, inpL);
}
-
- // layer input + FF
- cur = ggml_add(ctx0, cur, inpFF);
-
- // input for next layer
- inpL = ggml_add(ctx0, cur, inpL);
+
}
// norm
#include <vector>
#include <regex>
-// default hparams (StableLM 3B)
-struct stablelm_hparams {
- int32_t n_vocab = 50257;
- int32_t n_ctx = 4096;
- int32_t n_embd = 4096;
- int32_t n_head = 32;
- int32_t n_layer = 16;
- int32_t n_rot = 32; // 0.25 * (n_embd / n_head)
- int32_t ftype = 1;
+// default hparams (dollyv2 3B)
+struct dollyv2_hparams {
+ int32_t n_vocab = 50254; // tokenizer.vocab_size
+ int32_t n_ctx = 2048; // model.config.max_position_embeddings
+ int32_t n_embd = 2560; // model.config.hidden_size
+ int32_t n_head = 32; // model.config.num_attention_heads
+ int32_t n_layer = 32; // model.config.num_hidden_layers
+ int32_t n_rot = 20; // rotary_pct[25%] * (n_embd / n_head)
+ int32_t par_res = 1; // model.config.use_parallel_residual (1 = true, 0 = false)
+ int32_t ftype = GGML_FTYPE_MOSTLY_F16;
};
// quantize a model
-bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
fout.write((char *) &magic, sizeof(magic));
}
- stablelm_hparams hparams;
+ dollyv2_hparams hparams;
// load hparams
{
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
+ finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
+ fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
}
// usage:
-// ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
+// ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
if (argc != 4) {
{
const int64_t t_start_us = ggml_time_us();
- if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+ if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}