From: Lukas Möller
Date: Sun, 18 Jun 2023 08:34:21 +0000 (+0200)
Subject: replit : update inference code to match reference (#218)
X-Git-Tag: upstream/0.0.1642~1404
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=801b33ba68a233aff8c51a6ece0bcadb205e4812;p=pkg%2Fggml%2Fsources%2Fggml

replit : update inference code to match reference (#218)

* Update replit inference code to match reference

* Add qntvr printf
---

diff --git a/examples/replit/convert-h5-to-ggml.py b/examples/replit/convert-h5-to-ggml.py
index 310074b1..4fc15a97 100644
--- a/examples/replit/convert-h5-to-ggml.py
+++ b/examples/replit/convert-h5-to-ggml.py
@@ -73,6 +73,10 @@ for piece in sp_proto.pieces:
     fout.write(encoded_piece)
     fout.write(struct.pack("f", piece.score))
 
+if hparams["vocab_size"] > len(sp_proto.pieces):
+    for i in range(hparams["vocab_size"] - len(sp_proto.pieces)):
+        fout.write(struct.pack("i", 0))
+        fout.write(struct.pack("f", 0))
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp
index ae8ebf90..710f1eea 100644
--- a/examples/replit/main.cpp
+++ b/examples/replit/main.cpp
@@ -8,11 +8,12 @@
 #include 
 #include 
 #include 
-#include 
-
 #include 
+#include 
 #include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -78,7 +79,7 @@ bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int
     fin.read((char *)&len, sizeof(len));
     buf.resize(len);
 
-        fin.read((char *) buf.data(), len);
+        fin.read((char *)buf.data(), len);
     word.assign(buf.data(), len);
 
     float score;
@@ -127,38 +128,36 @@ std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std:
 }
 
 // no defaults for now
-struct mpt_hparams {
-    int32_t d_model = 0;
+struct replit_hparams {
+    int32_t d_model = 0;
     int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
+    int32_t n_heads = 0;
+    int32_t n_layers = 0;
+    int32_t n_vocab = 0;
+    int32_t ftype = 0;
 };
 
 struct replit_layer {
     // pre normalization
-    struct ggml_tensor * ln_1_weight;
+    struct ggml_tensor * norm_1_weight;
 
     // attention
     struct ggml_tensor * c_attn_wqkv_weight;
-
     struct ggml_tensor * c_attn_out_proj_weight;
 
     // post normalization
-    struct ggml_tensor * ln_2_weight;
+    struct ggml_tensor * norm_2_weight;
 
     // ff
-    struct ggml_tensor * c_mlp_mlp_up_weight;
-
-    struct ggml_tensor * c_mlp_mlp_down_weight;
+    struct ggml_tensor * ffn_up_proj;
+    struct ggml_tensor * ffn_down_proj;
 };
 
 struct replit_model {
-    mpt_hparams hparams;
+    replit_hparams hparams;
 
-    struct ggml_tensor * wte_weight; // position embedding
-    struct ggml_tensor * ln_f_weight; // language model head
+    struct ggml_tensor * wte_weight;    // position embedding
+    struct ggml_tensor * norm_f_weight; // language model head
 
     std::vector<replit_layer> layers;
 
@@ -194,22 +193,22 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
     {
         auto & hparams = model.hparams;
 
-        fin.read((char *) &hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
+        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
+        
fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers)); + fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *)&hparams.ftype, sizeof(hparams.ftype)); const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - printf("%s: d_model = %d\n", __func__, hparams.d_model); - printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); - printf("%s: n_heads = %d\n", __func__, hparams.n_heads); - printf("%s: n_layers = %d\n", __func__, hparams.n_layers); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); + printf("%s: d_model = %d\n", __func__, hparams.d_model); + printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); + printf("%s: n_heads = %d\n", __func__, hparams.n_heads); + printf("%s: n_layers = %d\n", __func__, hparams.n_layers); + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); hparams.ftype %= GGML_QNT_VERSION_FACTOR; } @@ -276,38 +275,37 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t { const auto & hparams = model.hparams; - const int n_embd = hparams.d_model; - const int n_layer = hparams.n_layers; - const int n_vocab = hparams.n_vocab; + const size_t n_embd = hparams.d_model; + const size_t n_layer = hparams.n_layers; + const size_t n_vocab = hparams.n_vocab; model.layers.resize(n_layer); model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.ln_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // map by name model.tensors["transformer.wte.weight"] = model.wte_weight; - model.tensors["transformer.ln_f.weight"] = model.ln_f_weight; + model.tensors["transformer.norm_f.weight"] = model.norm_f_weight; - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < (int)n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.ln_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.c_mlp_mlp_up_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); - layer.c_mlp_mlp_down_weight = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); + layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); + layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); // map by name - model.tensors["transformer.blocks." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight; model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight; model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_out_proj_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".ln_2.weight"] = layer.ln_2_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".mlp.mlp_up.weight"] = layer.c_mlp_mlp_up_weight; - model.tensors["transformer.blocks." 
+ std::to_string(i) + ".mlp.mlp_down.weight"] = - layer.c_mlp_mlp_down_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj; + model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj; } } @@ -327,7 +325,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); + printf("%s: memory_size = %8.2f MB, n_mem = %lld\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); } // load weights @@ -423,16 +421,17 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t // - embd_w: the predicted logits for the next token // bool replit_eval(const replit_model & model, const int n_threads, const int n_past, - const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token) { + const std::vector & embd_inp, std::vector & embd_w, bool logits_all, + size_t & mem_per_token) { const int N = embd_inp.size(); const auto & hparams = model.hparams; const int n_embd = hparams.d_model; const int n_layer = hparams.n_layers; - const int n_ctx = hparams.max_seq_len; const int n_head = hparams.n_heads; const int n_vocab = hparams.n_vocab; + const int n_ctx = hparams.max_seq_len; static size_t buf_size = 256u * 1024 * 1024; static void * buf = malloc(buf_size); @@ -474,7 +473,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa { cur = ggml_norm(ctx0, inpL); - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur); + cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); } // self-attention @@ -482,9 +481,8 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // attn_bias=attn_bias, attention_mask=attention_mask, // is_causal=is_causal) { - // compute QKV - { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); } + cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); @@ -525,7 +523,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0); + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); @@ -564,20 +562,20 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa { cur = ggml_norm(ctx0, inpL); - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur); + cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); } // n = self.mlp(m) { - cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_mlp_up_weight, cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); // GELU activation cur = ggml_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b 
- cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_mlp_down_weight, cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); } // x = x + n @@ -588,7 +586,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa { inpL = ggml_norm(ctx0, inpL); // inpL = ln_f_g*inpL - inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL); + inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); } // output embedding weight tied to input embedding @@ -606,12 +604,18 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // if (n_past%100 == 0) { // ggml_graph_print(&gf); - // ggml_graph_dump_dot(&gf, NULL, "replit-model.dot"); + // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot"); // } - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); + if (logits_all) { + // return result for all tokens + embd_w.resize(n_vocab * N); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL), sizeof(float) * n_vocab * N); + } else { + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); + } if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0) / N; @@ -624,8 +628,6 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa } int main(int argc, char ** argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); gpt_params params; @@ -643,7 +645,14 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); + if (!isatty(STDIN_FILENO)) { + std::string line; + while (std::getline(std::cin, line)) { + params.prompt = params.prompt + "\n" + line; + } + } else { + params.prompt = gpt_random_prompt(rng); + } } int64_t t_load_us = 0; @@ -687,14 +696,14 @@ int main(int argc, char ** argv) { // determine the required inference memory per token: size_t mem_per_token = 0; - replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); + replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { // predict if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!replit_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + if (!replit_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) { printf("Failed to predict\n"); return 1; }
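A note on the convert-h5-to-ggml.py hunk above: when the checkpoint's "vocab_size" hyperparameter is larger than the number of SentencePiece pieces, the converter now pads the vocabulary section with empty entries, presumably so that the loader, which reads one (length-prefixed word, score) record per vocabulary id, never runs short of entries. Below is a minimal sketch of that padding logic in isolation, not code from the repository; the record layout is assumed from the surrounding script, and write_vocab / Piece are illustrative names only:

    import io
    import struct
    from collections import namedtuple

    # toy stand-in for a SentencePiece proto piece (illustrative only)
    Piece = namedtuple("Piece", "piece score")

    def write_vocab(fout, pieces, vocab_size):
        # one record per tokenizer piece: int32 byte length, UTF-8 bytes, float32 score
        for p in pieces:
            text = p.piece.encode("utf-8")
            fout.write(struct.pack("i", len(text)))
            fout.write(text)
            fout.write(struct.pack("f", p.score))
        # pad with empty records (zero length, score 0.0) when the model was exported
        # with a vocab_size larger than the tokenizer's piece count
        for _ in range(max(vocab_size - len(pieces), 0)):
            fout.write(struct.pack("i", 0))
            fout.write(struct.pack("f", 0.0))

    buf = io.BytesIO()
    write_vocab(buf, [Piece("hello", -1.5), Piece("world", -2.0)], vocab_size=4)
    # buf now holds two real entries followed by two empty padding entries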