From: jaeminSon Date: Sat, 27 May 2023 08:47:34 +0000 (+0900) Subject: examples : add tokenization tests and refactor codes (#186) X-Git-Tag: upstream/0.0.1642~1434 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=765c9bce37a53c36da4723e9f014874c27b69aa1;p=pkg%2Fggml%2Fsources%2Fggml examples : add tokenization tests and refactor codes (#186) * examples : [refactor] remove unnecessary lines and segments * examples : [feature] add tokenization test for gpt-neox * examples : [feature] handle multibyte character set * examples : [refactor] find the longest token for word * examples : [refactor] move test_tokenizer to common.cpp as the function affects other models * add 'test_tokenizer' function after loading the model * examples : [feature] add test cases for checking tokenization * examples : [feature] tokenize with huggingface tokenizers for currently supported models * examples : add tokenization test cases for each model * revert conversion from string to utf-8 encoded byte strings * [refactor] make util functions for testing tokenizers available * [bug fix] test replit using functions and variables (e.g. tokenizer struct, tokenization method) defined in its main.cpp * [refactor] modify function name test_tokenizer -> test_gpt_tokenizer * [refactor] put parenthesis on single line for-loops and if-statements * [refactor] withdraw and use and * [refactor] remove 'find_test_file' function and directly set test file path from 'test_gpt_tokenizer' function * call a function for testing tokenizer with filename specified * revert test tokenizer in replit (replit uses separate methods for tokenization and decoding) * compare vector of id to check if two tokenizations are identical. * write token ids instead of strings. 
* [refactor] use --token_test rather than --test for token-test argument * add english test cases * update test cases with more english prompts * examples : tokenizer testing fixes --------- Co-authored-by: Georgi Gerganov --- diff --git a/examples/common.cpp b/examples/common.cpp index e30f524e..29082f60 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef M_PI #define M_PI 3.14159265358979323846 @@ -54,7 +55,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.prompt.back() == '\n') { params.prompt.pop_back(); } - } else { + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = argv[++i]; + } + else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); exit(0); @@ -75,6 +79,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " prompt to start generation with (default: random)\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " load prompt from a file\n"); + fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); + fprintf(stderr, " test tokenization\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); @@ -219,6 +225,7 @@ std::string convert_to_utf8(const std::wstring & input) { return converter.to_bytes(input); } + std::wstring convert_to_wstring(const std::string & input) { std::wstring_convert> converter; return converter.from_bytes(input); @@ -257,43 +264,94 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri } } - // find the longest tokens that form the words: + // find the longest token that forms each word in words: std::vector tokens; for (const auto & word : words) { - if (word.size() == 0) 
continue; - - int i = 0; - int n = word.size(); - while (i < n) { - int j = n; - while (j > i) { - auto it = vocab.token_to_id.find(word.substr(i, j-i)); - if (it != vocab.token_to_id.end()) { + for (int i = 0; i < word.size(); ){ + for (int j = word.size() - 1; j >= i; j--){ + auto cand = word.substr(i, j-i+1); + auto it = vocab.token_to_id.find(cand); + if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab tokens.push_back(it->second); - i = j; - j = n; - continue; + i = j + 1; + break; } - --j; - } - if (i == n) { - break; - } - if (j == i) { - auto sub = word.substr(i, 1); - if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { - tokens.push_back(vocab.token_to_id.at(sub)); - } else { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); + else if (j == i){ // word.substr(i, 1) has no matching + fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + i++; } - ++i; } } } + return tokens; } +std::vector parse_tokens_from_string(const std::string& input, char delimiter) { + std::vector output; + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, delimiter)) { + output.push_back(std::stoi(token)); + } + + return output; +} + +std::map> extract_tests_from_file(const std::string & fpath_test){ + if (fpath_test.empty()){ + fprintf(stderr, "%s : No test file found.\n", __func__); + return std::map>(); + } + + std::map> tests; + + auto fin = std::ifstream(fpath_test, std::ios_base::in); + const char * delimeter = " => "; + const char del_tok = ','; + std::string line; + while (std::getline(fin, line)) { + size_t delimiterPos = line.find(delimeter); + if (delimiterPos != std::string::npos) { + std::string text = line.substr(0, delimiterPos); + std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter)); + tests[text] = parse_tokens_from_string(s_tokens, del_tok); + } + } + return tests; +} + +void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & 
fpath_test){ + std::map> tests = extract_tests_from_file(fpath_test); + + size_t n_fails = 0; + + for (const auto & test : tests) { + std::vector tokens = gpt_tokenize(vocab, test.first); + + if (tokens != test.second){ + n_fails++; + + // print out failure cases + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str()); + fprintf(stderr, "%s : tokens in hf: ", __func__); + for (const auto & t : test.second) { + fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : tokens in ggml: ", __func__); + for (const auto & t : tokens) { + fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t); + } + fprintf(stderr, "\n"); + } + } + + fprintf(stderr, "%s : %lu tests failed out of %lu tests.\n", __func__, n_fails, tests.size()); +} + bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); diff --git a/examples/common.h b/examples/common.h index 4a24ffbc..0381802e 100644 --- a/examples/common.h +++ b/examples/common.h @@ -26,8 +26,9 @@ struct gpt_params { int32_t n_batch = 8; // batch size for prompt processing - std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path - std::string prompt; + std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path + std::string prompt = ""; + std::string token_test = ""; }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); @@ -77,6 +78,15 @@ std::wstring convert_to_wstring(const std::string & input); // std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); +// test outputs of gpt_tokenize +// +// - compare with tokens generated by the huggingface tokenizer +// - test cases are chosen based on the model's main language (under 'prompt' directory) +// - if all sentences are tokenized identically, print 'All tests passed.' 
+// - otherwise, print sentence, huggingface tokens, ggml tokens +// +void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test); + // load the tokens from encoder.json bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index d2783e2a..7b020d14 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -707,6 +707,8 @@ int main(int argc, char ** argv) { } t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); } int n_past = 0; diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 47f5e5e4..931c6133 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -732,6 +732,8 @@ int main(int argc, char ** argv) { } t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); } int n_past = 0; diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index ee6d1a99..48d0ce16 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -641,6 +641,8 @@ int main(int argc, char ** argv) { } t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); } int n_past = 0; diff --git a/examples/gpt-neox/convert-h5-to-ggml.py b/examples/gpt-neox/convert-h5-to-ggml.py index 9d21226f..f11a4cbc 100644 --- a/examples/gpt-neox/convert-h5-to-ggml.py +++ b/examples/gpt-neox/convert-h5-to-ggml.py @@ -15,9 +15,6 @@ if len(sys.argv) < 3: dir_model = sys.argv[1] fname_out = sys.argv[1] + "/ggml-model.bin" -with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - with open(dir_model + "/config.json", "r", encoding="utf-8") as f: hparams = json.load(f) @@ -39,9 +36,6 @@ if len(sys.argv) > 2: tokenizer = AutoTokenizer.from_pretrained(dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) -#print (model) - -#print(tokenizer.encode('I believe the meaning of life is')) 
list_vars = model.state_dict() for name in list_vars.keys(): @@ -62,11 +56,8 @@ fout.write(struct.pack("i", hparams["use_parallel_residual"] if "use_parallel_re fout.write(struct.pack("i", ftype)) # TODO: temporary hack to not deal with implementing the tokenizer -dot_token = tokenizer.encode('.')[0] for i in range(hparams["vocab_size"]): - text = tokenizer.decode([dot_token, i]).encode('utf-8') - # remove the first byte (it's always '.') - text = text[1:] + text = tokenizer.decode([i]).encode('utf-8') fout.write(struct.pack("i", len(text))) fout.write(text) @@ -81,10 +72,10 @@ for name in list_vars.keys(): print(" Skipping variable: " + name) continue - n_dims = len(data.shape); + n_dims = len(data.shape) # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0; + ftype_cur = 0 if ftype != 0: if name[-7:] == ".weight" and n_dims == 2: print(" Converting to float16") @@ -105,7 +96,7 @@ for name in list_vars.keys(): fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); + fout.write(str) # data data.tofile(fout) diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index c5a73232..a53f98d5 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -704,6 +704,8 @@ int main(int argc, char ** argv) { } t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); } int n_past = 0; @@ -804,4 +806,4 @@ int main(int argc, char ** argv) { ggml_free(model.ctx); return 0; -} +} \ No newline at end of file diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index 84bb3f93..f90c48c6 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -70,6 +70,7 @@ struct mpt_params { std::string model = ""; // model path std::string prompt = ""; + std::string token_test = ""; bool perplexity = false; @@ -93,6 +94,8 @@ void mpt_print_usage(int /*argc*/, char ** argv, const mpt_params & params) { 
fprintf(stderr, " prompt to start generation with (default: random)\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " load prompt from a file\n"); + fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); + fprintf(stderr, " test tokenization\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); fprintf(stderr, " --top_k N top-k sampling (default: %d, 0 = n_vocab)\n", params.top_k); fprintf(stderr, " --top_p N top-p sampling (default: %.2f)\n", params.top_p); @@ -155,6 +158,8 @@ bool mpt_params_parse(int argc, char ** argv, mpt_params & params) { if (params.prompt.back() == '\n') { params.prompt.pop_back(); } + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = argv[++i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); mpt_print_usage(argc, argv, params); @@ -687,7 +692,7 @@ std::vector softmax(const std::vector & logits) { return probs; } -int perplexity(mpt_params params) { +int perplexity(const mpt_params & params) { ggml_time_init(); const int64_t t_main_start_us = ggml_time_us(); @@ -894,6 +899,8 @@ int main(int argc, char ** argv) { } t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); } if (params.top_k == 0) { diff --git a/examples/prompts/dolly-v2.txt b/examples/prompts/dolly-v2.txt new file mode 100644 index 00000000..ecdb0b7a --- /dev/null +++ b/examples/prompts/dolly-v2.txt @@ -0,0 +1,100 @@ +Hello World! => 12092,3645,2 +I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476 +The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449 +"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574 +'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464 +"Let's meet at 2:30 p.m. in the park." 
=> 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449 +The book costs $19.99 => 510,1984,4815,370,746,15,1525 +"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449 +Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2 +C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32 +W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2 +H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32 +I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15 +Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32 +The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15 +I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15 +The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15 +She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15 +We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15 +He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15 +They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15 +The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15 +I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15 +We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15 +The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15 +She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15 +He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15 +The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15 +I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15 +The coffee tastes bitter without sugar. 
=> 510,8574,27491,17123,1293,8618,15 +They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15 +She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15 +We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15 +He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15 +The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15 +I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15 +They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15 +She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15 +He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15 +The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15 +I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15 +They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15 +She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15 +We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15 +The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15 +He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15 +The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15 +I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15 +They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15 +She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15 +We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15 +The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15 +He is a talented painter who sells his artwork. 
=> 1328,310,247,21220,27343,665,27924,521,28227,15 +The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15 +I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15 +They are renovating their house. => 3726,403,30074,839,616,2419,15 +She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15 +We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15 +The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15 +He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15 +The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15 +I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15 +They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15 +She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15 +We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15 +The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15 +He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15 +The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15 +I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15 +They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15 +She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15 +We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15 +The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15 +He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15 +The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15 +I need to do laundry and wash my clothes. 
=> 42,878,281,513,29023,285,14841,619,10015,15 +They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15 +She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15 +We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15 +The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15 +He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15 +The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15 +I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15 +They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15 +She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15 +We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15 +The traffic is congest => 510,7137,310,25801 +The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15 +I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15 +She plays the piano beautifully. => 2993,7120,253,18542,27839,15 +The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15 +I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15 +He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15 +The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15 +She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15 +The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15 +We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15 +He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15 +The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15 +She cooked a delicious meal for her family. 
=> 2993,18621,247,17319,11484,323,617,2021,15 diff --git a/examples/prompts/gpt-2-chinese.txt b/examples/prompts/gpt-2-chinese.txt new file mode 100644 index 00000000..919829d8 --- /dev/null +++ b/examples/prompts/gpt-2-chinese.txt @@ -0,0 +1 @@ +请问洗手间在哪里? => 6435,7309,3819,2797,7313,1762,1525,7027,8043 diff --git a/examples/prompts/gpt-2.txt b/examples/prompts/gpt-2.txt new file mode 100644 index 00000000..a2ed9310 --- /dev/null +++ b/examples/prompts/gpt-2.txt @@ -0,0 +1,100 @@ +Hello World! => 15496,2159,0 +I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474 +The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526 +"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496 +'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637 +"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526 +The book costs $19.99 => 464,1492,3484,720,1129,13,2079 +"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526 +Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0 +C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30 +W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0 +H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30 +I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13 +Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30 +The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13 +I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13 +The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13 +She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13 +We went for a walk on the beach yesterday. 
=> 1135,1816,329,257,2513,319,262,10481,7415,13 +He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13 +They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13 +The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13 +I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13 +We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13 +The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13 +She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13 +He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13 +The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13 +I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13 +The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13 +They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13 +She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13 +We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13 +He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13 +The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13 +I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13 +They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13 +She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13 +He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13 +The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13 +I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13 +They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13 +She is cooking a delicious meal for us. 
=> 3347,318,10801,257,12625,9799,329,514,13 +We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13 +The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13 +He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13 +The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13 +I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13 +They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13 +She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13 +We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13 +The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13 +He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13 +The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13 +I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13 +They are renovating their house. => 2990,389,24317,803,511,2156,13 +She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13 +We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13 +The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13 +He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13 +The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13 +I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13 +They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13 +She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13 +We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13 +The cake tastes delicious with vanilla frosting. 
=> 464,12187,18221,12625,351,16858,21682,278,13 +He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13 +The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13 +I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13 +They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13 +She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13 +We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13 +The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13 +He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13 +The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13 +I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13 +They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13 +She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13 +We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13 +The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13 +He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13 +The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13 +I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13 +They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13 +She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13 +We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13 +The traffic is congest => 464,4979,318,22791 +The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13 +I enjoy reading books in my free time. 
=> 40,2883,3555,3835,287,616,1479,640,13 +She plays the piano beautifully. => 3347,5341,262,19132,21104,13 +The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13 +I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13 +He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13 +The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13 +She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13 +The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13 +We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13 +He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13 +The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13 +She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13 diff --git a/examples/prompts/gpt-j.txt b/examples/prompts/gpt-j.txt new file mode 100644 index 00000000..a2ed9310 --- /dev/null +++ b/examples/prompts/gpt-j.txt @@ -0,0 +1,100 @@ +Hello World! => 15496,2159,0 +I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474 +The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526 +"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496 +'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637 +"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526 +The book costs $19.99 => 464,1492,3484,720,1129,13,2079 +"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526 +Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0 +C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30 +W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0 +H0w 4re y0u t0d@y? 
=> 39,15,86,604,260,331,15,84,256,15,67,31,88,30 +I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13 +Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30 +The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13 +I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13 +The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13 +She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13 +We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13 +He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13 +They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13 +The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13 +I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13 +We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13 +The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13 +She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13 +He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13 +The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13 +I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13 +The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13 +They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13 +She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13 +We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13 +He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13 +The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13 +I enjoy watching romantic movies. 
=> 40,2883,4964,14348,6918,13 +They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13 +She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13 +He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13 +The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13 +I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13 +They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13 +She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13 +We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13 +The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13 +He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13 +The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13 +I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13 +They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13 +She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13 +We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13 +The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13 +He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13 +The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13 +I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13 +They are renovating their house. => 2990,389,24317,803,511,2156,13 +She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13 +We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13 +The traffic is heavy during rush hour. 
=> 464,4979,318,4334,1141,10484,1711,13 +He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13 +The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13 +I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13 +They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13 +She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13 +We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13 +The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13 +He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13 +The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13 +I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13 +They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13 +She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13 +We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13 +The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13 +He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13 +The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13 +I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13 +They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13 +She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13 +We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13 +The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13 +He is a hardworking farmer who grows crops. 
=> 1544,318,257,1327,16090,18739,508,13676,14450,13 +The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13 +I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13 +They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13 +She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13 +We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13 +The traffic is congest => 464,4979,318,22791 +The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13 +I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13 +She plays the piano beautifully. => 3347,5341,262,19132,21104,13 +The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13 +I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13 +He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13 +The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13 +She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13 +The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13 +We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13 +He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13 +The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13 +She cooked a delicious meal for her family. 
=> 3347,15847,257,12625,9799,329,607,1641,13 diff --git a/examples/prompts/gpt-neox-japanese.txt b/examples/prompts/gpt-neox-japanese.txt new file mode 100644 index 00000000..c39df160 --- /dev/null +++ b/examples/prompts/gpt-neox-japanese.txt @@ -0,0 +1 @@ +明日の天気はどうですか。 => 263,7353,268,18461,271,1722,18405,265 diff --git a/examples/prompts/gpt-neox.txt b/examples/prompts/gpt-neox.txt new file mode 100644 index 00000000..ecdb0b7a --- /dev/null +++ b/examples/prompts/gpt-neox.txt @@ -0,0 +1,100 @@ +Hello World! => 12092,3645,2 +I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476 +The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449 +"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574 +'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464 +"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449 +The book costs $19.99 => 510,1984,4815,370,746,15,1525 +"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449 +Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2 +C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32 +W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2 +H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32 +I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15 +Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32 +The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15 +I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15 +The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15 +She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15 +We went for a walk on the beach yesterday. 
=> 1231,2427,323,247,2940,327,253,11600,11066,15 +He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15 +They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15 +The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15 +I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15 +We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15 +The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15 +She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15 +He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15 +The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15 +I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15 +The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15 +They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15 +She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15 +We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15 +He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15 +The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15 +I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15 +They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15 +She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15 +He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15 +The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15 +I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15 +They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15 +She is cooking a delicious meal for us. 
=> 2993,310,12398,247,17319,11484,323,441,15 +We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15 +The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15 +He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15 +The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15 +I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15 +They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15 +She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15 +We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15 +The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15 +He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15 +The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15 +I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15 +They are renovating their house. => 3726,403,30074,839,616,2419,15 +She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15 +We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15 +The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15 +He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15 +The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15 +I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15 +They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15 +She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15 +We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15 +The cake tastes delicious with vanilla frosting. 
=> 510,15221,27491,17319,342,26724,34724,272,15 +He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15 +The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15 +I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15 +They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15 +She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15 +We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15 +The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15 +He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15 +The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15 +I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15 +They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15 +She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15 +We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15 +The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15 +He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15 +The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15 +I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15 +They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15 +She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15 +We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15 +The traffic is congest => 510,7137,310,25801 +The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15 +I enjoy reading books in my free time. 
=> 42,4264,4361,5098,275,619,1959,673,15 +She plays the piano beautifully. => 2993,7120,253,18542,27839,15 +The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15 +I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15 +He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15 +The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15 +She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15 +The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15 +We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15 +He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15 +The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15 +She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15 diff --git a/examples/prompts/polyglot-ko.txt b/examples/prompts/polyglot-ko.txt new file mode 100644 index 00000000..41fa0085 --- /dev/null +++ b/examples/prompts/polyglot-ko.txt @@ -0,0 +1,3 @@ +이것은 테스트 이다. => 12271,296,6474,28037,17 +걱정할 필요 없다. => 18311,482,1062,550,267,17 +버그는 언젠가 고쳐진다. => 6904,272,8575,10381,1765,17 diff --git a/examples/prompts/replit.txt b/examples/prompts/replit.txt new file mode 100644 index 00000000..7b5ffcf1 --- /dev/null +++ b/examples/prompts/replit.txt @@ -0,0 +1,100 @@ +Hello World! => 6466,147,2317,350 +I can't believe it's already Friday!" => 286,512,172,185,13392,393,172,155,3239,147,29249,8537 +The URL for the website is https://www.example.com." => 505,5635,250,170,11745,235,147,303,262,552,148,811,148,241,148,161 +"She said, 'I love to travel.'" => 161,10386,4089,150,206,286,8440,194,147,12363,148,172,161 +'The temperature is 25.5°C.' => 172,505,147,9502,235,147,20022,8516,228,148,172 +"Let's meet at 2:30 p.m. in the park." 
=> 161,8997,172,155,17120,536,147,162,5245,147,207,148,204,148,219,170,147,17664,148,161 +The book costs $19.99 => 505,147,2277,17494,236,166,11824 +"John's favorite color is blue." => 161,7475,172,155,147,11105,147,349,235,17046,148,161 +Th@nk y0u f0r y0ur h3lp! => 6309,240,9019,147,237,159,247,147,202,159,223,147,237,159,2458,147,226,171,3899,350 +C@n I g3t a c0ffee, pl3@se? => 228,240,211,398,147,267,171,185,216,147,196,159,13360,163,150,147,1287,171,240,155,163,272 +W0w! Th@t's @m@zing! => 450,159,274,350,147,6309,240,185,172,155,268,204,240,301,248,350 +H0w 4re y0u t0d@y? => 304,159,274,320,440,147,237,159,247,147,185,159,182,240,237,272 +I l0ve t0 tr@vel @r0und the w0rld. => 286,997,159,1290,147,185,159,147,490,240,3893,268,223,159,3981,170,147,274,159,223,2833,148 +Wh@t's y0ur f@v0rite m0vie? => 450,226,240,185,172,155,147,237,159,2458,147,202,240,252,159,5961,163,147,204,159,24373,272 +The cat is sleeping on the mat. => 505,147,1604,235,147,3987,248,347,170,147,1297,148 +I need to buy some groceries for dinner. => 286,1645,194,147,8068,1499,147,10022,1037,10023,250,147,182,2749,148 +The sun is shining brightly in the sky. => 505,147,5852,235,147,7304,2967,147,215,649,391,219,170,147,7310,148 +She is reading a book in the park. => 10386,235,9838,216,147,2277,219,170,147,17664,148 +We went for a walk on the beach yesterday. => 3250,10825,250,216,147,8156,347,170,294,5371,147,28830,148 +He plays the guitar like a pro. => 5301,7084,155,170,147,4604,2214,1425,216,3474,148 +They are going to the movies tonight. => 18815,429,6552,194,170,147,15877,194,7907,148 +The flowers are blooming in the garden. => 505,147,22953,155,429,147,10411,2799,248,219,170,147,22140,148 +I enjoy listening to classical music. => 286,23162,15876,248,194,239,4251,147,7395,148 +We need to buy groceries for the week. => 3250,1645,194,147,8068,147,10022,1037,10023,250,170,9238,148 +The dog is chasing its tail in circles. 
=> 505,147,6540,235,147,196,916,248,1602,147,5129,219,147,4095,155,148 +She is wearing a beautiful red dress. => 10386,235,147,16427,248,216,147,23447,147,1160,147,14592,148 +He is a talented actor in Hollywood. => 5301,235,216,147,29750,246,147,5112,219,147,16924,391,10477,148 +The children are playing in the playground. => 505,7934,429,7084,248,219,170,7084,12055,148 +I'm going to visit my grandparents this weekend. => 286,172,204,6552,194,9939,1247,147,11806,12019,291,9238,314,148 +The coffee tastes bitter without sugar. => 505,147,21526,147,20931,155,5145,1430,1988,147,28759,148 +They are planning a surprise party for her. => 18815,429,147,23661,216,147,29240,147,7344,250,1869,148 +She sings like an angel on stage. => 10386,147,155,6502,1425,426,147,26028,347,12685,148 +We should take a vacation to relax. => 3250,936,4654,216,147,15388,946,194,1998,2744,148 +He is studying medicine at the university. => 5301,235,7959,248,147,20742,1668,536,170,147,8025,148 +The rain is pouring heavily outside. => 505,147,6885,235,5306,248,1189,5451,391,8096,148 +I enjoy watching romantic movies. => 286,23162,147,3355,248,147,26080,4140,147,15877,148 +They are celebrating their anniversary today. => 18815,429,147,30000,5841,1669,147,24734,5464,1770,13386,148 +She dances gracefully to the music. => 10386,147,182,1626,155,147,267,8771,8001,194,170,147,7395,148 +He is an excellent basketball player. => 5301,235,426,147,12300,675,185,147,26646,5132,6294,148 +The baby is sleeping soundly in the crib. => 505,147,23597,235,147,3987,248,12642,391,219,170,147,7696,215,148 +I need to finish my homework before dinner. => 286,1645,194,147,6717,1247,147,1071,2722,2643,147,182,2749,148 +They are organizing a charity event next month. => 18815,429,147,16442,248,216,1054,1511,1663,2399,12821,148 +She is cooking a delicious meal for us. => 10386,235,147,20453,248,216,3936,23455,147,26658,250,147,539,148 +We should go hiking in the mountains. 
=> 3250,936,4242,147,2254,5357,219,170,147,204,18028,155,148 +The car broke down on the way to work. => 505,7553,147,510,10036,4288,347,170,3699,194,1916,148 +He loves playing video games in his free time. => 5301,8440,155,7084,248,8722,147,11281,219,1439,4002,801,148 +The birds are chirping in the trees. => 505,147,13043,155,429,147,3904,223,4639,219,170,5311,155,148 +I want to learn how to play the piano. => 286,1857,194,14167,2496,194,7084,170,147,207,23635,148 +They are building a new shopping mall in the city. => 18815,429,11038,216,277,147,22184,147,204,609,219,170,147,2416,148 +She is writing a novel in her spare time. => 10386,235,3242,216,147,25814,219,1869,6772,2382,801,148 +We are going to the zoo this Saturday. => 3250,429,6552,194,170,147,25101,291,147,31426,148 +The cake looks delicious with chocolate frosting. => 505,147,24422,16303,3936,23455,312,147,5619,533,2239,147,202,3973,3431,148 +He is a talented painter who sells his artwork. => 5301,235,216,147,29750,246,147,9226,279,2888,13004,155,1439,12234,2722,148 +The students are studying for their exams. => 505,15707,429,7959,248,250,1669,147,12398,155,148 +I enjoy swimming in the ocean. => 286,23162,147,4729,8528,248,219,170,147,26193,148 +They are renovating their house. => 18815,429,991,10724,3643,1669,13788,148 +She is practicing yoga to stay healthy. => 10386,235,147,18453,248,147,5063,1186,194,15344,147,28550,148 +We should plant flowers in the garden. => 3250,936,147,9212,147,22953,155,219,170,147,22140,148 +The traffic is heavy during rush hour. => 505,147,11097,235,147,22232,4340,147,22319,147,5686,148 +He is a skilled chef who creates amazing dishes. => 5301,235,216,147,8891,246,9784,202,2888,13720,147,28880,147,23852,383,148 +The baby is crawling on the floor. => 505,147,23597,235,147,22120,248,347,170,147,5895,148 +I need to buy a new pair of shoes. => 286,1645,194,147,8068,216,277,12632,210,147,155,21953,155,148 +They are going on a road trip across the country. 
=> 18815,429,6552,347,216,147,6362,147,11395,9762,170,11305,148 +She is playing the piano beautifully. => 10386,235,7084,248,170,147,207,23635,147,23447,391,148 +We are going to a concert tomorrow night. => 3250,429,6552,194,216,1710,4391,29524,12716,148 +The cake tastes delicious with vanilla frosting. => 505,147,24422,147,20931,155,3936,23455,312,5535,7476,147,202,3973,3431,148 +He is a dedicated teacher who inspires his students. => 5301,235,216,326,8298,3460,147,9675,2888,147,28801,155,1439,15707,148 +The students are participating in a science fair. => 505,15707,429,147,30961,3643,219,216,147,10587,147,7636,148 +I enjoy hiking in the mountains. => 286,23162,147,2254,5357,219,170,147,204,18028,155,148 +They are organizing a beach cleanup next weekend. => 18815,429,147,16442,248,216,294,5371,147,10401,2399,9238,314,148 +She is taking photographs of nature. => 10386,235,147,12345,147,4709,1547,155,210,147,211,8603,148 +We should try a new restaurant in town. => 3250,936,147,746,216,277,147,11007,219,147,10200,148 +The traffic is moving slowly on the highway. => 505,147,11097,235,147,8601,147,9880,391,347,170,5976,3330,148 +He is a talented singer with a beautiful voice. => 5301,235,216,147,29750,246,147,155,248,279,312,216,147,23447,147,9316,148 +The baby is laughing and giggling. => 505,147,23597,235,147,23066,248,221,147,2341,3631,2869,148 +I need to do laundry and wash my clothes. => 286,1645,194,543,960,3981,2154,221,147,27589,1247,147,22141,383,148 +They are planning a trip to Europe. => 18815,429,147,23661,216,147,11395,194,13131,148 +She is learning how to play the guitar. => 10386,235,11754,2496,194,7084,170,147,4604,2214,148 +We are going to a museum this Sunday. => 3250,429,6552,194,216,147,204,433,1177,291,147,29111,148 +The coffee smells amazing in the morning. => 505,147,21526,31454,155,147,28880,219,170,20701,148 +He is a hardworking farmer who grows crops. 
=> 5301,235,216,8524,14992,147,16679,279,2888,147,6044,155,147,8650,155,148 +The students are presenting their research projects. => 505,15707,429,5130,248,1669,13217,14235,148 +I enjoy playing soccer with my friends. => 286,23162,7084,248,147,9351,5318,312,1247,147,5347,155,148 +They are volunteering at a local shelter. => 18815,429,147,5238,7478,163,12798,536,216,2491,2905,1359,279,148 +She is practicing martial arts for self-defense. => 10386,235,147,18453,248,147,3261,185,4381,12234,155,250,623,153,29896,148 +We should try a new recipe for dinner. => 3250,936,147,746,216,277,147,9851,250,147,182,2749,148 +The traffic is congest => 505,147,11097,235,1710,14169 +The sun is shining brightly today. => 505,147,5852,235,147,7304,2967,147,215,649,391,13386,148 +I enjoy reading books in my free time. => 286,23162,9838,147,9670,219,1247,4002,801,148 +She plays the piano beautifully. => 10386,7084,155,170,147,207,23635,147,23447,391,148 +The cat chased the mouse around the room. => 505,147,1604,147,196,916,246,170,12551,6890,170,9654,148 +I love eating pizza with extra cheese. => 286,8440,147,163,3643,147,207,8403,312,8230,9784,383,163,148 +He always wears a hat wherever he goes. => 5301,5418,147,16427,155,216,147,4879,2171,2433,1189,16177,148 +The flowers in the garden are blooming. => 505,147,22953,155,219,170,147,22140,429,147,10411,2799,248,148 +She danced gracefully on the stage. => 10386,13378,12408,147,267,8771,8001,347,170,12685,148 +The dog barked loudly in the park. => 505,147,6540,147,973,293,246,147,30182,391,219,170,147,17664,148 +We went swimming in the ocean yesterday. => 3250,10825,147,4729,8528,248,219,170,147,26193,147,28830,148 +He speaks fluent French and Spanish. => 5301,147,13285,155,147,21677,147,254,17590,221,147,31519,148 +The train arrived at the station on time. => 505,147,872,147,20712,182,536,170,147,7184,347,801,148 +She cooked a delicious meal for her family. 
=> 10386,147,20453,246,216,3936,23455,147,26658,250,1869,147,2002,148 diff --git a/examples/prompts/starcoder.txt b/examples/prompts/starcoder.txt new file mode 100644 index 00000000..03a5b221 --- /dev/null +++ b/examples/prompts/starcoder.txt @@ -0,0 +1,100 @@ +Hello World! => 8279,10896,19 +I can't believe it's already Friday!" => 59,883,1330,13710,561,1182,3425,506,25674,11555 +The URL for the website is https://www.example.com." => 1318,3834,436,322,9575,438,1678,555,1499,32,2763,32,508,3107 +"She said, 'I love to travel.'" => 20,25387,9884,30,330,59,14290,372,25283,29329 +'The temperature is 25.5°C.' => 25,1318,13587,438,225,36,39,32,39,23767,53,4564 +"Let's meet at 2:30 p.m. in the park." => 20,9809,1182,18450,821,225,36,44,37,34,298,32,95,32,328,322,880,93,3107 +The book costs $19.99 => 1318,7618,25950,398,35,43,32,43,43 +"John's favorite color is blue." => 20,19693,1182,27448,1963,438,10087,3107 +Th@nk y0u f0r y0ur h3lp! => 1027,50,19877,533,34,103,296,34,100,533,34,305,420,37,1915,19 +C@n I g3t a c0ffee, pl3@se? => 53,50,96,439,485,37,102,312,281,34,21298,30,1278,37,50,277,49 +W0w! Th@t's @m@zing! => 73,34,105,19,947,50,102,1182,477,95,50,26768,19 +H0w 4re y0u t0d@y? => 58,34,105,225,38,268,533,34,103,273,34,86,50,107,49 +I l0ve t0 tr@vel @r0und the w0rld. => 59,456,34,587,273,34,554,50,1203,477,100,34,642,322,341,34,100,1381,32 +Wh@t's y0ur f@v0rite m0vie? => 2444,50,102,1182,533,34,305,296,50,104,34,1049,345,34,104,1075,49 +The cat is sleeping on the mat. => 1318,10501,438,9368,299,544,322,2491,32 +I need to buy some groceries for dinner. => 59,1849,372,16968,1629,20234,85,6958,436,343,3369,32 +The sun is shining brightly in the sky. => 1318,15323,438,787,19068,38231,631,328,322,26718,32 +She is reading a book in the park. => 25387,438,9175,312,7618,328,322,880,93,32 +We went for a walk on the beach yesterday. => 3122,14236,436,312,13503,544,322,526,867,39485,32 +He plays the guitar like a pro. 
=> 1331,41271,322,3932,19931,2124,312,534,32 +They are going to the movies tonight. => 31805,884,6783,372,322,27889,26076,694,32 +The flowers are blooming in the garden. => 1318,7290,483,884,323,18466,299,328,322,485,22461,32 +I enjoy listening to classical music. => 59,31567,20498,372,443,1578,17522,32 +We need to buy groceries for the week. => 3122,1849,372,16968,20234,85,6958,436,322,8209,32 +The dog is chasing its tail in circles. => 1318,27435,438,663,9949,2819,13203,328,46428,32 +She is wearing a beautiful red dress. => 25387,438,996,6992,312,36493,3346,343,714,32 +He is a talented actor in Hollywood. => 1331,438,312,273,9556,318,16038,328,48228,631,21118,32 +The children are playing in the playground. => 1318,5713,884,19788,328,322,4654,1749,32 +I'm going to visit my grandparents this weekend. => 59,3464,6783,372,7725,1672,33162,19277,458,40618,32 +The coffee tastes bitter without sugar. => 1318,36917,273,633,307,3493,391,2876,309,18628,32 +They are planning a surprise party for her. => 31805,884,26116,312,6178,9251,15270,436,7791,32 +She sings like an angel on stage. => 25387,309,2052,2124,600,600,17691,544,10019,32 +We should take a vacation to relax. => 3122,1395,4818,312,29164,367,372,41972,32 +He is studying medicine at the university. => 1331,438,14866,299,32388,482,821,322,707,9190,32 +The rain is pouring heavily outside. => 1318,36987,438,9202,299,46003,2801,11127,32 +I enjoy watching romantic movies. => 59,31567,37652,26045,7268,27889,32 +They are celebrating their anniversary today. => 31805,884,48278,839,1741,3623,23921,5810,672,11610,32 +She dances gracefully to the music. => 25387,343,3151,31376,4938,372,322,17522,32 +He is an excellent basketball player. => 1331,438,600,39203,48400,11653,4362,32 +The baby is sleeping soundly in the crib. => 1318,323,17156,438,9368,299,9934,631,328,322,281,7972,32 +I need to finish my homework before dinner. => 59,1849,372,11361,1672,6765,1007,2670,343,3369,32 +They are organizing a charity event next month. 
=> 31805,884,10558,6183,312,1351,543,1692,2354,6811,32 +She is cooking a delicious meal for us. => 25387,438,23682,299,312,409,406,2406,597,279,436,1770,32 +We should go hiking in the mountains. => 3122,1395,1983,420,1546,299,328,322,10874,1907,32 +The car broke down on the way to work. => 1318,6346,43289,2835,544,322,3352,372,1389,32 +He loves playing video games in his free time. => 1331,598,4954,19788,6027,19705,328,6697,3741,1133,32 +The birds are chirping in the trees. => 1318,8424,3210,884,663,476,7075,328,322,23453,32 +I want to learn how to play the piano. => 59,2637,372,7350,2624,372,4654,322,298,25757,32 +They are building a new shopping mall in the city. => 31805,884,9038,312,537,40692,345,464,328,322,11297,32 +She is writing a novel in her spare time. => 25387,438,4127,312,32913,328,7791,1869,586,1133,32 +We are going to the zoo this Saturday. => 3122,884,6783,372,322,1288,604,458,358,30288,32 +The cake looks delicious with chocolate frosting. => 1318,281,1062,7780,409,406,2406,623,10408,27589,296,20932,299,32 +He is a talented painter who sells his artwork. => 1331,438,312,273,9556,318,42300,6560,10800,101,6697,5549,1007,32 +The students are studying for their exams. => 1318,16512,884,14866,299,436,3623,538,1462,32 +I enjoy swimming in the ocean. => 59,31567,2535,449,6714,328,322,337,18857,32 +They are renovating their house. => 31805,884,316,15007,1741,3623,17075,32 +She is practicing yoga to stay healthy. => 25387,438,11808,11636,533,40067,372,20005,44538,32 +We should plant flowers in the garden. => 3122,1395,26795,7290,483,328,322,485,22461,32 +The traffic is heavy during rush hour. => 1318,16391,438,32389,5929,540,1372,12021,32 +He is a skilled chef who creates amazing dishes. => 1331,438,312,3001,12088,44051,6560,9585,36986,1214,4279,32 +The baby is crawling on the floor. => 1318,323,17156,438,281,1294,2920,544,322,17648,32 +I need to buy a new pair of shoes. 
=> 59,1849,372,16968,312,537,6092,432,787,37764,32 +They are going on a road trip across the country. => 31805,884,6783,544,312,24122,19337,10160,322,10769,32 +She is playing the piano beautifully. => 25387,438,19788,322,298,25757,526,4846,325,514,107,32 +We are going to a concert tomorrow night. => 3122,884,6783,372,312,457,6989,31841,19212,32 +The cake tastes delicious with vanilla frosting. => 1318,281,1062,273,633,307,409,406,2406,623,44653,296,20932,299,32 +He is a dedicated teacher who inspires his students. => 1331,438,312,23112,30877,6560,26194,8017,6697,16512,32 +The students are participating in a science fair. => 1318,16512,884,24623,1741,328,312,27536,19375,32 +I enjoy hiking in the mountains. => 59,31567,420,1546,299,328,322,10874,1907,32 +They are organizing a beach cleanup next weekend. => 31805,884,10558,6183,312,526,867,13144,2354,40618,32 +She is taking photographs of nature. => 25387,438,15137,15110,23626,432,24406,32 +We should try a new restaurant in town. => 3122,1395,1596,312,537,43719,328,38212,32 +The traffic is moving slowly on the highway. => 1318,16391,438,14089,12899,631,544,322,3857,3073,32 +He is a talented singer with a beautiful voice. => 1331,438,312,273,9556,318,309,10118,623,312,36493,20309,32 +The baby is laughing and giggling. => 1318,323,17156,438,2317,2943,299,461,485,365,36088,32 +I need to do laundry and wash my clothes. => 59,1849,372,745,2317,642,994,461,341,917,1672,7375,46948,32 +They are planning a trip to Europe. => 31805,884,26116,312,19337,372,27268,32 +She is learning how to play the guitar. => 25387,438,9608,2624,372,4654,322,3932,19931,32 +We are going to a museum this Sunday. => 3122,884,6783,372,312,345,539,378,458,358,28036,32 +The coffee smells amazing in the morning. => 1318,36917,309,42153,101,36986,328,322,33768,32 +He is a hardworking farmer who grows crops. => 1331,438,312,6784,13578,9019,2302,6560,485,2138,25170,1069,32 +The students are presenting their research projects. 
=> 1318,16512,884,5024,299,3623,13234,8528,32 +I enjoy playing soccer with my friends. => 59,31567,19788,22682,10035,623,1672,22523,32 +They are volunteering at a local shelter. => 31805,884,3920,45585,8637,821,312,2196,309,2542,391,32 +She is practicing martial arts for self-defense. => 25387,438,11808,11636,345,502,564,5549,101,436,630,31,43694,32 +We should try a new recipe for dinner. => 3122,1395,1596,312,537,15233,436,343,3369,32 +The traffic is congest => 1318,16391,438,457,2776 +The sun is shining brightly today. => 1318,15323,438,787,19068,38231,631,11610,32 +I enjoy reading books in my free time. => 59,31567,9175,21739,328,1672,3741,1133,32 +She plays the piano beautifully. => 25387,41271,322,298,25757,526,4846,325,514,107,32 +The cat chased the mouse around the room. => 1318,10501,663,16109,322,8459,6835,322,8355,32 +I love eating pizza with extra cheese. => 59,14290,484,1741,47630,623,6717,8277,30315,32 +He always wears a hat wherever he goes. => 1331,5182,996,4177,312,25793,2154,424,938,13107,32 +The flowers in the garden are blooming. => 1318,7290,483,328,322,485,22461,884,323,18466,299,32 +She danced gracefully on the stage. => 25387,343,6087,31376,4938,544,322,10019,32 +The dog barked loudly in the park. => 1318,27435,323,1087,318,598,836,631,328,322,880,93,32 +We went swimming in the ocean yesterday. => 3122,14236,2535,449,6714,328,322,337,18857,39485,32 +He speaks fluent French and Spanish. => 1331,24498,101,38055,43652,461,14911,1708,32 +The train arrived at the station on time. => 1318,5683,2099,32114,821,322,18662,544,1133,32 +She cooked a delicious meal for her family. => 25387,23682,318,312,409,406,2406,597,279,436,7791,13872,32 diff --git a/examples/prompts/test-cases.txt b/examples/prompts/test-cases.txt new file mode 100644 index 00000000..4d0bdbf9 --- /dev/null +++ b/examples/prompts/test-cases.txt @@ -0,0 +1,110 @@ +# test case format +# : + +English: Hello World! +English: I can't believe it's already Friday!" 
+English: The URL for the website is https://www.example.com." +English: "She said, 'I love to travel.'" +English: 'The temperature is 25.5°C.' +English: "Let's meet at 2:30 p.m. in the park." +English: The book costs $19.99 +English: "John's favorite color is blue." +English: Th@nk y0u f0r y0ur h3lp! +English: C@n I g3t a c0ffee, pl3@se? +English: W0w! Th@t's @m@zing! +English: H0w 4re y0u t0d@y? +English: I l0ve t0 tr@vel @r0und the w0rld. +English: Wh@t's y0ur f@v0rite m0vie? +English: The cat is sleeping on the mat. +English: I need to buy some groceries for dinner. +English: The sun is shining brightly in the sky. +English: She is reading a book in the park. +English: We went for a walk on the beach yesterday. +English: He plays the guitar like a pro. +English: They are going to the movies tonight. +English: The flowers are blooming in the garden. +English: I enjoy listening to classical music. +English: We need to buy groceries for the week. +English: The dog is chasing its tail in circles. +English: She is wearing a beautiful red dress. +English: He is a talented actor in Hollywood. +English: The children are playing in the playground. +English: I'm going to visit my grandparents this weekend. +English: The coffee tastes bitter without sugar. +English: They are planning a surprise party for her. +English: She sings like an angel on stage. +English: We should take a vacation to relax. +English: He is studying medicine at the university. +English: The rain is pouring heavily outside. +English: I enjoy watching romantic movies. +English: They are celebrating their anniversary today. +English: She dances gracefully to the music. +English: He is an excellent basketball player. +English: The baby is sleeping soundly in the crib. +English: I need to finish my homework before dinner. +English: They are organizing a charity event next month. +English: She is cooking a delicious meal for us. +English: We should go hiking in the mountains. 
+English: The car broke down on the way to work. +English: He loves playing video games in his free time. +English: The birds are chirping in the trees. +English: I want to learn how to play the piano. +English: They are building a new shopping mall in the city. +English: She is writing a novel in her spare time. +English: We are going to the zoo this Saturday. +English: The cake looks delicious with chocolate frosting. +English: He is a talented painter who sells his artwork. +English: The students are studying for their exams. +English: I enjoy swimming in the ocean. +English: They are renovating their house. +English: She is practicing yoga to stay healthy. +English: We should plant flowers in the garden. +English: The traffic is heavy during rush hour. +English: He is a skilled chef who creates amazing dishes. +English: The baby is crawling on the floor. +English: I need to buy a new pair of shoes. +English: They are going on a road trip across the country. +English: She is playing the piano beautifully. +English: We are going to a concert tomorrow night. +English: The cake tastes delicious with vanilla frosting. +English: He is a dedicated teacher who inspires his students. +English: The students are participating in a science fair. +English: I enjoy hiking in the mountains. +English: They are organizing a beach cleanup next weekend. +English: She is taking photographs of nature. +English: We should try a new restaurant in town. +English: The traffic is moving slowly on the highway. +English: He is a talented singer with a beautiful voice. +English: The baby is laughing and giggling. +English: I need to do laundry and wash my clothes. +English: They are planning a trip to Europe. +English: She is learning how to play the guitar. +English: We are going to a museum this Sunday. +English: The coffee smells amazing in the morning. +English: He is a hardworking farmer who grows crops. +English: The students are presenting their research projects. 
+English: I enjoy playing soccer with my friends. +English: They are volunteering at a local shelter. +English: She is practicing martial arts for self-defense. +English: We should try a new recipe for dinner. +English: The traffic is congest +English: The sun is shining brightly today. +English: I enjoy reading books in my free time. +English: She plays the piano beautifully. +English: The cat chased the mouse around the room. +English: I love eating pizza with extra cheese. +English: He always wears a hat wherever he goes. +English: The flowers in the garden are blooming. +English: She danced gracefully on the stage. +English: The dog barked loudly in the park. +English: We went swimming in the ocean yesterday. +English: He speaks fluent French and Spanish. +English: The train arrived at the station on time. +English: She cooked a delicious meal for her family. +Korean: 이것은 테스트 이다. +Korean: 걱정할 필요 없다. +Korean: 버그는 언젠가 고쳐진다. +Japanese: 明日の天気はどうですか。 +Chinese: 请问洗手间在哪里? +Emoji: I'm feeling 😄 today! 
"""Generate reference tokenizations with the Hugging Face tokenizers.

Reads ``test-cases.txt`` from the current working directory (one
``<language>: <sentence>`` entry per line) and, for every repository in
``list_repo_hf``, writes ``<ggml model name>.txt`` containing one
``<sentence> => <id>,<id>,...`` line for each test sentence whose language
matches the language of that model.
"""
import os

# Set before any tokenizer is created.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

# Hugging Face repositories whose tokenizers are used as the reference.
list_repo_hf = ["databricks/dolly-v2-3b",            # dolly-v2 (3b, 7b, 12b models share the same tokenizer)
                "gpt2",                              # gpt-2 (gpt2-xl, gpt2-large share the same tokenizer)
                "uer/gpt2-chinese-cluecorpussmall",  # gpt-2-chinese
                "EleutherAI/gpt-j-6b",               # gpt-j
                "EleutherAI/gpt-neox-20b",           # gpt-neox
                "EleutherAI/polyglot-ko-1.3b",       # gpt-neox (polyglot-ko 5.8b and 12.8b share the same tokenizer)
                "rinna/japanese-gpt-neox-3.6b",      # gpt-neox
                                                     # mpt-7b (uses gpt-neox-20b tokenizer)
                "replit/replit-code-v1-3b",          # replit
                "bigcode/starcoder",                 # starcoder (huggingface-cli login required)
                "openai/whisper-tiny"                # whisper (base, large, large-v2 share the same tokenizer)
                ]

# Repository -> base name of the output file (written as "<name>.txt").
repo2ggml = {"databricks/dolly-v2-3b"           : "dolly-v2",
             "gpt2"                             : "gpt-2",
             "uer/gpt2-chinese-cluecorpussmall" : "gpt-2-chinese",
             "EleutherAI/gpt-j-6b"              : "gpt-j",
             "EleutherAI/gpt-neox-20b"          : "gpt-neox",
             "EleutherAI/polyglot-ko-1.3b"      : "polyglot-ko",
             "rinna/japanese-gpt-neox-3.6b"     : "gpt-neox-japanese",
             "replit/replit-code-v1-3b"         : "replit",
             "bigcode/starcoder"                : "starcoder",
             "openai/whisper-tiny"              : "whisper"}

# Repository -> language tag (lower case) of the test sentences it is run on.
repo2language = {"databricks/dolly-v2-3b"           : "english",
                 "gpt2"                             : "english",
                 "uer/gpt2-chinese-cluecorpussmall" : "chinese",
                 "EleutherAI/gpt-j-6b"              : "english",
                 "EleutherAI/gpt-neox-20b"          : "english",
                 "EleutherAI/polyglot-ko-1.3b"      : "korean",
                 "rinna/japanese-gpt-neox-3.6b"     : "japanese",
                 "replit/replit-code-v1-3b"         : "english",
                 "bigcode/starcoder"                : "english",
                 "openai/whisper-tiny"              : "english"}

# Separator between the language tag and the sentence in test-cases.txt.
delimiter = ": "


def parse_test_cases(lines):
    """Parse ``<language>: <sentence>`` lines into ``(language, sentence)`` pairs.

    Trailing whitespace is stripped from every line. Lines starting with ``#``
    and lines without the delimiter are skipped. The line is split at the
    FIRST delimiter only, so a sentence may itself contain ``": "``. The
    language tag is lower-cased; the sentence is returned unchanged.
    """
    cases = []
    for line in lines:
        line = line.rstrip()
        if line.startswith("#"):
            # header/comment line of the test-case file, not a test case
            continue
        language, found, sentence = line.partition(delimiter)
        if found:
            cases.append((language.lower(), sentence))
    return cases


def format_reference_line(sentence, tokens):
    """Return one output line: ``<sentence> => <id>,<id>,...`` plus a newline."""
    return sentence + " => " + ",".join(str(t) for t in tokens) + "\n"


def main():
    # Imported here so the helpers above can be used without the dependency.
    from transformers import AutoTokenizer

    # The test cases contain Korean, Japanese, Chinese and emoji text: read and
    # write as UTF-8 explicitly instead of relying on the locale's default.
    with open("test-cases.txt", "r", encoding="utf-8") as f:
        test_sentences = parse_test_cases(f)

    for repo in list_repo_hf:
        target_language = repo2language[repo]

        # NOTE(security): trust_remote_code=True lets the repository execute its
        # own Python code on this machine. Presumably it is needed by repos that
        # ship a custom tokenizer -- confirm, and only run against trusted repos.
        tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

        tokens_hf = []
        for language, sentence in test_sentences:
            if language == target_language:
                tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
                tokens_hf.append((sentence, tokens))

        save_txt = repo2ggml[repo] + ".txt"
        with open(save_txt, "w", encoding="utf-8") as f:
            f.writelines(format_reference_line(sentence, tokens) for sentence, tokens in tokens_hf)


if __name__ == "__main__":
    main()
=> 40,287,15,303,220,83,15,220,6903,31,779,10428,81,15,997,220,3322,261,15,81,348,13 +Wh@t's y0ur f@v0rite m0vie? => 2471,31,83,311,288,15,374,283,31,85,15,35002,275,15,12702,30 +The cat is sleeping on the mat. => 2278,3857,307,8296,322,220,3322,3803,13 +I need to buy some groceries for dinner. => 40,643,220,1353,2256,512,31391,337,6148,13 +The sun is shining brightly in the sky. => 2278,3295,307,18269,47418,294,220,3322,5443,13 +She is reading a book in the park. => 9526,307,3760,257,1446,294,220,3322,3884,13 +We went for a walk on the beach yesterday. => 4360,1437,337,257,1792,322,220,3322,7534,5186,13 +He plays the guitar like a pro. => 5205,5749,220,3322,7531,411,257,447,13 +They are going to the movies tonight. => 8829,366,516,220,1353,220,3322,6233,220,1756,397,13 +The flowers are blooming in the garden. => 2278,8085,366,45294,294,220,3322,7431,13 +I enjoy listening to classical music. => 40,2103,4764,220,1353,13735,1318,13 +We need to buy groceries for the week. => 4360,643,220,1353,2256,31391,337,220,3322,1243,13 +The dog is chasing its tail in circles. => 2278,3000,307,17876,1080,220,14430,294,13040,13 +She is wearing a beautiful red dress. => 9526,307,4769,257,2238,2182,5231,13 +He is a talented actor in Hollywood. => 5205,307,257,220,32831,6003,8747,294,11628,13 +The children are playing in the playground. => 2278,2227,366,2433,294,220,3322,24646,13 +I'm going to visit my grandparents this weekend. => 40,478,516,220,1353,3441,452,21876,220,11176,6711,13 +The coffee tastes bitter without sugar. => 2278,4982,220,83,40246,13871,1553,5076,13 +They are planning a surprise party for her. => 8829,366,5038,257,6365,3595,337,720,13 +She sings like an angel on stage. => 9526,23250,411,364,14250,322,3233,13 +We should take a vacation to relax. => 4360,820,220,27612,257,12830,220,1353,5789,13 +He is studying medicine at the university. => 5205,307,7601,7195,412,220,3322,5454,13 +The rain is pouring heavily outside. 
=> 2278,4830,307,20450,10950,2380,13 +I enjoy watching romantic movies. => 40,2103,1976,13590,6233,13 +They are celebrating their anniversary today. => 8829,366,15252,220,3322,347,12962,220,83,378,320,13 +She dances gracefully to the music. => 9526,28322,10042,2277,220,1353,220,3322,1318,13 +He is an excellent basketball player. => 5205,307,364,7103,11767,4256,13 +The baby is sleeping soundly in the crib. => 2278,3186,307,8296,1626,356,294,220,3322,47163,13 +I need to finish my homework before dinner. => 40,643,220,1353,2413,452,14578,949,6148,13 +They are organizing a charity event next month. => 8829,366,17608,257,16863,2280,958,1618,13 +She is cooking a delicious meal for us. => 9526,307,6361,257,4809,6791,337,505,13 +We should go hiking in the mountains. => 4360,820,352,23784,294,220,3322,10233,13 +The car broke down on the way to work. => 2278,1032,6902,760,322,220,3322,636,220,1353,589,13 +He loves playing video games in his free time. => 5205,6752,2433,960,2813,294,702,1737,220,3766,13 +The birds are chirping in the trees. => 2278,9009,366,36682,294,220,3322,220,3599,279,13 +I want to learn how to play the piano. => 40,528,220,1353,1466,577,220,1353,862,220,3322,9211,13 +They are building a new shopping mall in the city. => 8829,366,2390,257,777,8688,16026,294,220,3322,2307,13 +She is writing a novel in her spare time. => 9526,307,3579,257,7613,294,720,13798,220,3766,13 +We are going to the zoo this Saturday. => 4360,366,516,220,1353,220,3322,25347,220,11176,8803,13 +The cake looks delicious with chocolate frosting. => 2278,5908,1542,4809,365,6215,37048,13 +He is a talented painter who sells his artwork. => 5205,307,257,220,32831,6003,26619,567,20897,702,15829,13 +The students are studying for their exams. => 2278,1731,366,7601,337,220,3322,347,20514,13 +I enjoy swimming in the ocean. => 40,2103,11989,294,220,3322,7810,13 +They are renovating their house. => 8829,366,18845,990,220,3322,347,1782,13 +She is practicing yoga to stay healthy. 
=> 9526,307,11350,15128,220,1353,1754,4627,13 +We should plant flowers in the garden. => 4360,820,3709,8085,294,220,3322,7431,13 +The traffic is heavy during rush hour. => 2278,220,17227,3341,307,4676,1830,9300,1773,13 +He is a skilled chef who creates amazing dishes. => 5205,307,257,19690,10530,567,7829,2243,10814,13 +The baby is crawling on the floor. => 2278,3186,307,32979,322,220,3322,4123,13 +I need to buy a new pair of shoes. => 40,643,220,1353,2256,257,777,6119,295,6654,13 +They are going on a road trip across the country. => 8829,366,516,322,257,3060,220,83,8400,2108,220,3322,1941,13 +She is playing the piano beautifully. => 9526,307,2433,220,3322,9211,16525,13 +We are going to a concert tomorrow night. => 4360,366,516,220,1353,257,8543,220,83,298,3162,1818,13 +The cake tastes delicious with vanilla frosting. => 2278,5908,220,83,40246,4809,365,17528,37048,13 +He is a dedicated teacher who inspires his students. => 5205,307,257,8374,220,975,4062,567,32566,702,1731,13 +The students are participating in a science fair. => 2278,1731,366,13950,294,257,3497,3143,13 +I enjoy hiking in the mountains. => 40,2103,23784,294,220,3322,10233,13 +They are organizing a beach cleanup next weekend. => 8829,366,17608,257,7534,40991,958,6711,13 +She is taking photographs of nature. => 9526,307,220,48625,17649,295,3687,13 +We should try a new restaurant in town. => 4360,820,220,83,627,257,777,6383,294,220,30401,13 +The traffic is moving slowly on the highway. => 2278,220,17227,3341,307,2684,5692,322,220,3322,17205,13 +He is a talented singer with a beautiful voice. => 5205,307,257,220,32831,6003,11564,365,257,2238,3177,13 +The baby is laughing and giggling. => 2278,3186,307,5059,293,290,24542,13 +I need to do laundry and wash my clothes. => 40,643,220,1353,360,19811,293,5675,452,5534,13 +They are planning a trip to Europe. => 8829,366,5038,257,220,83,8400,220,1353,3315,13 +She is learning how to play the guitar. 
=> 9526,307,2539,577,220,1353,862,220,3322,7531,13 +We are going to a museum this Sunday. => 4360,366,516,220,1353,257,8441,220,11176,7776,13 +The coffee smells amazing in the morning. => 2278,4982,10036,2243,294,220,3322,2446,13 +He is a hardworking farmer who grows crops. => 5205,307,257,1152,22475,17891,567,13156,16829,13 +The students are presenting their research projects. => 2278,1731,366,15578,220,3322,347,2132,4455,13 +I enjoy playing soccer with my friends. => 40,2103,2433,15469,365,452,1855,13 +They are volunteering at a local shelter. => 8829,366,33237,412,257,2654,13341,13 +She is practicing martial arts for self-defense. => 9526,307,11350,20755,8609,337,2698,12,49268,13 +We should try a new recipe for dinner. => 4360,820,220,83,627,257,777,6782,337,6148,13 +The traffic is congest => 2278,220,17227,3341,307,31871 +The sun is shining brightly today. => 2278,3295,307,18269,47418,220,83,378,320,13 +I enjoy reading books in my free time. => 40,2103,3760,3642,294,452,1737,220,3766,13 +She plays the piano beautifully. => 9526,5749,220,3322,9211,16525,13 +The cat chased the mouse around the room. => 2278,3857,33091,220,3322,9719,926,220,3322,1808,13 +I love eating pizza with extra cheese. => 40,959,3936,8298,365,2857,5399,13 +He always wears a hat wherever he goes. => 5205,1009,20877,257,2385,8660,415,1709,13 +The flowers in the garden are blooming. => 2278,8085,294,220,3322,7431,366,45294,13 +She danced gracefully on the stage. => 9526,32909,10042,2277,322,220,3322,3233,13 +The dog barked loudly in the park. => 2278,3000,16202,292,22958,294,220,3322,3884,13 +We went swimming in the ocean yesterday. => 4360,1437,11989,294,220,3322,7810,5186,13 +He speaks fluent French and Spanish. => 5205,10789,40799,5522,293,8058,13 +The train arrived at the station on time. => 2278,220,83,7146,6678,412,220,3322,5214,322,220,3766,13 +She cooked a delicious meal for her family. 
=> 9526,9267,257,4809,6791,337,720,1605,13 diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index c2f144c9..e10da392 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -764,4 +764,4 @@ int main(int argc, char ** argv) { ggml_free(model.ctx); return 0; -} +} \ No newline at end of file diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index 2a6be4ea..67e50782 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -758,6 +758,8 @@ int main(int argc, char ** argv) { } t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); } int n_past = 0;