From: Georgi Gerganov Date: Fri, 20 Oct 2023 07:05:28 +0000 (+0300) Subject: gpt-2 : add ignore-eos flag X-Git-Tag: upstream/0.0.1642~1213 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f25addb9e88853648c60b96fd73ab95709d25e53;p=pkg%2Fggml%2Fsources%2Fggml gpt-2 : add ignore-eos flag --- diff --git a/examples/common.cpp b/examples/common.cpp index d55708ad..603c655a 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -38,8 +38,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); } else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); } else if (arg == "-p" || arg == "--prompt") { params.prompt = get_next_arg(i, argc, argv, arg, params); } else if (arg == "-n" || arg == "--n_predict") { @@ -60,6 +58,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); } else if (arg == "-c" || arg == "--context") { params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { + params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--ignore-eos") { + params.ignore_eos = true; } else if (arg == "-m" || arg == "--model") { params.model = get_next_arg(i, argc, argv, arg, params); } else if (arg == "-i" || arg == "--interactive") { @@ -101,7 +103,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: 
%d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: random)\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); @@ -116,6 +117,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx); + fprintf(stderr, " --ignore-eos ignore EOS token during generation\n"); + fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/examples/common.h b/examples/common.h index 635afc24..1d4e9c37 100644 --- a/examples/common.h +++ b/examples/common.h @@ -15,12 +15,15 @@ // struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 200; // new tokens to predict - int32_t n_parallel = 1; // number of parallel streams - int32_t n_batch = 8; // batch size for prompt processing - int32_t n_ctx = 2048; // context size (this is the KV cache max size) + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_predict = 200; // new tokens to predict + int32_t n_parallel = 1; // number of parallel streams + int32_t n_batch = 8; // batch size for prompt processing + int32_t n_ctx = 2048; // context size (this is the KV cache max 
size) +   int32_t n_gpu_layers = 0; // number of layers to offload to the GPU + +   bool ignore_eos = false; // ignore EOS token when generating text // sampling parameters int32_t top_k = 40; @@ -35,8 +38,6 @@ struct gpt_params { bool interactive = false; int32_t interactive_port = -1; - - int32_t n_gpu_layers = 0; }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp index 1c499bd5..76066ce8 100644 --- a/examples/gpt-2/main-batched.cpp +++ b/examples/gpt-2/main-batched.cpp @@ -1136,7 +1136,7 @@ int main(int argc, char ** argv) { } // is it an end of stream? -> mark the stream as finished - if (id == 50256 || n_cur == n_len - 1) { + if ((!params.ignore_eos && id == 50256) || n_cur == n_len - 1) { i_batch[i] = -1; printf("\n"); if (n_parallel > 1) { diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 5702fd8e..514f299a 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -975,7 +975,7 @@ int main(int argc, char ** argv) { fflush(stdout); // end of text token - if (embd.back() == 50256) { + if (!params.ignore_eos && embd.back() == 50256) { break; } }