From: Georgi Gerganov
Date: Wed, 24 May 2023 07:54:45 +0000 (+0300)
Subject: mpt : fix n_ctx (close #165)
X-Git-Tag: upstream/0.0.1642~1444
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=a651d5882a2ff4f1f6c06ed7f0222ce6547c5097;p=pkg%2Fggml%2Fsources%2Fggml

mpt : fix n_ctx (close #165)
---

diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp
index 74235aa3..18d3bd3e 100644
--- a/examples/mpt/main.cpp
+++ b/examples/mpt/main.cpp
@@ -197,10 +197,13 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+        hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx);
+
         const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
 
         printf("%s: d_model = %d\n", __func__, hparams.d_model);
         printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
+        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
         printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
@@ -304,30 +307,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 
         model.layers.resize(n_layer);
 
-        model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
+        model.wte_weight    = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
         model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
         // map by name
-        model.tensors["transformer.wte.weight"] = model.wte_weight;
+        model.tensors["transformer.wte.weight"]    = model.wte_weight;
         model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
 
         for (int i = 0; i < (int) n_layer; ++i) {
             auto & layer = model.layers[i];
 
-            layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd);
-            layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd);
-            layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd);
+            layer.norm_1_weight          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.c_attn_wqkv_weight     = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd);
+            layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.norm_2_weight          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ffn_up_proj            = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd);
+            layer.ffn_down_proj          = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd);
 
             // map by name
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
-                layer.c_attn_out_proj_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
+            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"]        = layer.norm_1_weight;
+            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"]     = layer.c_attn_wqkv_weight;
+            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_out_proj_weight;
+            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"]        = layer.norm_2_weight;
+            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"]   = layer.ffn_up_proj;
             model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
         }
     }
@@ -336,11 +338,11 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     {
         const auto & hparams = model.hparams;
 
-        const size_t n_embd = hparams.d_model;
+        const size_t n_embd  = hparams.d_model;
         const size_t n_layer = hparams.n_layers;
 
-        const int64_t n_mem = n_layer * n_ctx;
-        const int64_t n_elements = n_embd * n_mem;
+        const int64_t n_mem      = n_layer * n_ctx;
+        const int64_t n_elements = n_embd * n_mem;
 
         model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
         model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
@@ -690,8 +692,8 @@ int perplexity(mpt_params params) {
     const int64_t t_main_start_us = ggml_time_us();
 
     printf("%s: n_threads = %d\n", __func__, params.n_threads);
-    printf("%s: n_batch = %d\n", __func__, params.n_batch);
-    printf("%s: n_ctx = %d\n", __func__, params.n_ctx);
+    printf("%s: n_batch   = %d\n", __func__, params.n_batch);
+    printf("%s: n_ctx     = %d\n", __func__, params.n_ctx);
     printf("\n");
 
     int64_t t_load_us = 0;
@@ -763,7 +765,7 @@ int perplexity(mpt_params params) {
             const int64_t t_start_us = ggml_time_us();
 
             if (!mpt_eval(model, params.n_threads, j * batch_size, embd, batch_logits, true, mem_per_token)) {
-                printf("Failed to predict\n");
+                printf("%s: failed to evaluate model\n", __func__);
                 return 1;
             }
 
@@ -830,10 +832,9 @@ int perplexity(mpt_params params) {
 
         printf("\n\n");
        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f,
-               t_predict_us / 1000.0f / (n_chunk * params.n_ctx) );
-        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+        printf("%s: load time     = %8.2f ms\n", __func__, t_load_us / 1000.0f);
+        printf("%s: eval time     = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / (n_chunk * params.n_ctx));
+        printf("%s: total time    = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
     }
 
     ggml_free(model.ctx);
@@ -869,7 +870,6 @@ int main(int argc, char ** argv) {
     printf("%s: n_batch = %d\n", __func__, params.n_batch);
     printf("%s: n_ctx = %d\n", __func__, params.n_ctx);
     printf("%s: n_predict = %d\n\n", __func__, params.n_predict);
-    printf("\n");
 
     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -924,7 +924,6 @@ int main(int argc, char ** argv) {
 
    for (size_t i = 0; i < embd_inp.size(); i++) {
         printf("%s: token[%lu] = %6d\n", __func__, i, embd_inp[i]);
-        // vocab.id_to_token.at(embd_inp[i]).c_str()
     }
     printf("\n");
 
@@ -945,7 +944,7 @@ int main(int argc, char ** argv) {
             const int64_t t_start_us = ggml_time_us();
 
             if (!mpt_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) {
-                printf("Failed to predict\n");
+                printf("%s: failed to predict\n", __func__);
                 return 1;
             }
 
@@ -999,7 +998,6 @@ int main(int argc, char ** argv) {
         // display text
         for (auto id : embd) {
             printf("%s", vocab.id_to_token[id].c_str());
-// printf("[%i]%s", id, vocab.id_to_token[id].c_str());
         }
         fflush(stdout);
 
@@ -1017,10 +1015,8 @@ int main(int argc, char ** argv) {
         printf("%s: sampled tokens = %8d\n", __func__, n_sampled);
         printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
         printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s: sample time = %8.2f ms / %.2f ms per token\n", __func__, t_sample_us / 1000.0f,
-               t_sample_us / 1000.0f / n_sampled);
-        printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f,
-               t_predict_us / 1000.0f / n_past);
+        printf("%s: sample time = %8.2f ms / %.2f ms per token\n", __func__, t_sample_us / 1000.0f, t_sample_us / 1000.0f / n_sampled);
+        printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
     }
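Editor's note on the patch: the functional change is the single added line hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx); — the rest is log output and whitespace. The clamp matters because the key/value memory is allocated as n_layer * n_ctx * n_embd elements, so a requested context larger than the model's max_seq_len would oversize the KV cache for positions the model cannot use anyway. The following is a minimal, self-contained C++ sketch of that clamp and the resulting KV-cache sizing; the sketch_hparams struct and its default values are illustrative stand-ins, not the actual mpt_hparams definition from examples/mpt/main.cpp.

    // Illustrative sketch only: field names mirror the hparams touched in the
    // diff above, but this is not the real mpt_hparams struct from main.cpp.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    struct sketch_hparams {
        int32_t d_model     = 4096;   // embedding size (n_embd)
        int32_t max_seq_len = 2048;   // context length the model was trained for
        int32_t n_layers    = 32;
        int32_t n_ctx       = 8192;   // user-requested context, deliberately too large
    };

    int main() {
        sketch_hparams hparams;

        // The fix from the diff: never let the runtime context exceed the
        // model's maximum sequence length.
        hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx);

        // KV cache sizing as in mpt_model_load: one K and one V tensor, each
        // holding n_layer * n_ctx * n_embd F16 values.
        const int64_t n_mem      = (int64_t) hparams.n_layers * hparams.n_ctx;
        const int64_t n_elements = hparams.d_model * n_mem;

        printf("n_ctx (clamped)   = %d\n", hparams.n_ctx);
        printf("kv cache elements = %lld per tensor\n", (long long) n_elements);
        return 0;
    }

Compiled as an ordinary C++11 program, the sketch prints the clamped n_ctx (2048 with these illustrative values) and the per-tensor element count, showing how the clamp directly bounds the KV-cache allocation.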