Fixing the last deviations from sentencepiece indicated by test-tokenizer-1 (#3170)

author goerch <redacted>

Sat, 16 Sep 2023 11:41:33 +0000 (13:41 +0200)

committer GitHub <redacted>

Sat, 16 Sep 2023 11:41:33 +0000 (13:41 +0200)
author goerch <redacted>
Sat, 16 Sep 2023 11:41:33 +0000 (13:41 +0200)
committer GitHub <redacted>
Sat, 16 Sep 2023 11:41:33 +0000 (13:41 +0200)
diff --git a/common/common.cpp b/common/common.cpp

index 02ec0f8d0861a3d2da69ae38a1530a2a179c09fc..6d655fd5548c5746daa75124b0609bbdeeadf736 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -801,10 +801,10 @@ std::vector<llama_token> llama_tokenize(
      // upper limit for the number of tokens
      int n_tokens = text.length() + add_bos;
      std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
      if (n_tokens < 0) {
          result.resize(-n_tokens);
-        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
          GGML_ASSERT(check == -n_tokens);
      } else {
          result.resize(n_tokens);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp

index 947aa7ed3bd3e625669dbaa4503def9760eb7051..59c90c7ba654d9c387284bb5f2df48a2d1da0bc5 100644 (file)
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -965,10 +965,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
  
      buf[size] = '\0';
  
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false);
      if (n_tokens < 0) {
          out.resize(-n_tokens);
-        n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+        n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false);
      }
      GGML_ASSERT(n_tokens >= 0);
      out.resize(n_tokens);
diff --git a/llama.cpp b/llama.cpp

index a6502612232f88cff1f35a405836224489a49978..0b334b4e980e6dab950f9ddcfc7d859c95f72a64 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -7032,19 +7032,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
  int llama_tokenize(
          struct llama_context * ctx,
                    const char * text,
+                         int   text_len,
                   llama_token * tokens,
                           int   n_max_tokens,
                          bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
  }
  
  int llama_tokenize_with_model(
      const struct llama_model * model,
                    const char * text,
+                         int   text_len,
                   llama_token * tokens,
                           int   n_max_tokens,
                          bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
  
      if (n_max_tokens < (int) res.size()) {
          // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
diff --git a/llama.h b/llama.h

index c6ee038c727d7aa6d204e1fb7b2f88a42392d0ef..369be048c001276bcf8d3cea6ca0c5dcebd79a2c 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -374,6 +374,7 @@ extern "C" {
      LLAMA_API int llama_tokenize(
              struct llama_context * ctx,
                        const char * text,
+                             int   text_len,
                       llama_token * tokens,
                               int   n_max_tokens,
                              bool   add_bos);
@@ -381,6 +382,7 @@ extern "C" {
      LLAMA_API int llama_tokenize_with_model(
          const struct llama_model * model,
                        const char * text,
+                             int   text_len,
                       llama_token * tokens,
                               int   n_max_tokens,
                              bool   add_bos);
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp

index edbd86f8514d67447262300044f1d9c74cdd14c4..dfb2e81a9bc6ff19368c4aa4a3e5d61dc7056ea7 100644 (file)
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -36,6 +36,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
          { "   Hello"              , {    1678,  15043, }, },
          { "    Hello"             , {     268,  15043, }, },
          { "    Hello\n    Hello"  , {     268,  15043,     13,   1678,  15043, }, },
+        { " ("                    , {   29871,  313, }, },
      };
  
      return _k_tests;
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp

index 804ea2486a67a6336d78f8bfdf819d3e664f38e6..a95d462cfcd0b181c54950ff5697c3ba5c4456f0 100644 (file)
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -87,10 +87,9 @@ int main(int argc, char **argv) {
          std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
          std::string check = llama_detokenize_spm(ctx, tokens);
          if (check != str) {
-            fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
+            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                  __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
-            if(i != 3)
-                return 2;
+            return 2;
          }
      }
  
@@ -99,11 +98,10 @@ int main(int argc, char **argv) {
              std::string str = codepoint_to_utf8(cp);
              std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
              std::string check = llama_detokenize_spm(ctx, tokens);
-            if (str != check) {
-                fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
+            if (cp != 9601 && str != check) {
+                fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                      __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                if(cp != 0 && cp != 9601)
-                    return 3;
+                return 3;
              }
          }
      }
@@ -112,7 +110,7 @@ int main(int argc, char **argv) {
          std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
          std::string check = llama_detokenize_spm(ctx, tokens);
          if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
+            fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                  __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
              return 4;
          }
author	goerch <redacted>
	Sat, 16 Sep 2023 11:41:33 +0000 (13:41 +0200)
committer	GitHub <redacted>
	Sat, 16 Sep 2023 11:41:33 +0000 (13:41 +0200)
common/common.cpp		patch | blob | history
examples/train-text-from-scratch/train-text-from-scratch.cpp		patch | blob | history
llama.cpp		patch | blob | history
llama.h		patch | blob | history
tests/test-tokenizer-0-llama.cpp		patch | blob | history
tests/test-tokenizer-1-llama.cpp		patch | blob | history