In general, it seems to work, but I think it fails for Unicode character support. Hopefully, someone can help with that.
- I don't know yet how much the quantization affects the quality of the generated text (a sketch of the scheme follows this list)
- Probably the token sampling can be improved
- No Windows support
- x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
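Loosely, what the 4-bit quantization does is store weights in small blocks, each holding one floating-point scale plus a 4-bit integer per weight. Below is a minimal sketch of that round-to-nearest idea; the block size, struct name, and packing are illustrative assumptions, not ggml's exact on-disk format:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative 4-bit block quantization (assumed layout, not ggml's actual format):
// each block of QK floats becomes one float scale + QK packed 4-bit integers.
constexpr int QK = 32; // assumed block size

struct BlockQ4 {
    float   d;         // scale: value ~= (q - 8) * d
    uint8_t q[QK / 2]; // 32 quants, two 4-bit values per byte
};

BlockQ4 quantize_block(const float * x) {
    float amax = 0.0f; // absolute maximum in the block
    for (int i = 0; i < QK; ++i) amax = std::max(amax, std::fabs(x[i]));

    BlockQ4 b{};
    b.d = amax / 7.0f; // map [-amax, amax] onto the integer range [-7, 7]
    const float id = b.d != 0.0f ? 1.0f / b.d : 0.0f;

    for (int i = 0; i < QK; i += 2) {
        // round to nearest integer, shift into [1, 15], pack two per byte
        const uint8_t q0 = (uint8_t) std::min(15.0f, std::roundf(x[i + 0] * id) + 8.0f);
        const uint8_t q1 = (uint8_t) std::min(15.0f, std::roundf(x[i + 1] * id) + 8.0f);
        b.q[i / 2] = q0 | (q1 << 4);
    }
    return b;
}

void dequantize_block(const BlockQ4 & b, float * y) {
    for (int i = 0; i < QK; i += 2) {
        y[i + 0] = (float) ((b.q[i / 2] & 0x0F) - 8) * b.d;
        y[i + 1] = (float) ((b.q[i / 2] >> 4) - 8) * b.d;
    }
}

int main() {
    float x[QK], y[QK];
    for (int i = 0; i < QK; ++i) x[i] = std::sin(0.3f * i); // arbitrary sample data
    dequantize_block(quantize_block(x), y);
    for (int i = 0; i < 4; ++i) std::printf("%+.4f -> %+.4f\n", x[i], y[i]);
}
```

In a layout like this, a 32-float block shrinks from 128 bytes to 20, and the rounding error introduced by the shared per-block scale is exactly the quality question raised above.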
 }
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
+    //auto res = gpt_tokenize(vocab, text);
+
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
+    std::vector<gpt_vocab::id> res;
     if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+        res.push_back(1); // TODO: replace with vocab.bos
     }
-    //std::vector<gpt_vocab::id> res;
+    // find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
-    //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
-    //}
+        if (l == 0 && t != 13) {
+            break;
+        }
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
-
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
-
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+        res.push_back(t);
+        pos += l;
+    }
     return res;
 }
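The code enabled by this change is a greedy longest-match tokenizer: at every position it scans the whole vocabulary and consumes the longest token that matches the remaining text. A self-contained sketch of the same idea with a toy vocabulary (the vocabulary and token ids are made up for illustration):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using id = int;

// Toy stand-in for gpt_vocab::id_to_token (contents are made up)
const std::map<id, std::string> id_to_token = {
    {4, "he"}, {5, "hell"}, {6, "hello"}, {7, " world"}, {8, "o"},
};

// Greedy longest-match tokenization, the same idea as llama_tokenize above
std::vector<id> tokenize(const std::string & text) {
    std::vector<id> res;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t best_len = 0;
        id     best_id  = 0;
        for (const auto & kv : id_to_token) {
            const std::string & tok = kv.second;
            if (tok.size() <= best_len || tok.size() > text.size() - pos) continue;
            if (text.compare(pos, tok.size(), tok) == 0) {
                best_len = tok.size();
                best_id  = kv.first;
            }
        }
        if (best_len == 0) break; // nothing in the vocab matches here
        res.push_back(best_id);
        pos += best_len;
    }
    return res;
}

int main() {
    for (id t : tokenize("hello world")) std::printf("%d ", t); // prints: 6 7
    std::printf("\n");
}
```

Note that this rescans the entire vocabulary at every position, so it is O(text length × vocab size), and the loop stops as soon as nothing matches, so any byte sequence absent from the vocabulary (such as part of a multi-byte UTF-8 character) silently truncates the result. That failure mode is likely what the Unicode note at the top refers to.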
 struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
     int32_t n_batch = 8; // batch size for prompt processing
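For context on what `top_k`, `top_p`, and `temp` control, here is a generic sketch of temperature + top-k + top-p (nucleus) sampling over a logit vector. This illustrates the standard technique, not this repository's exact sampling code, and the function name is hypothetical:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

// Pick a token id from raw logits: scale by temperature, keep the top_k most
// likely tokens, then keep the smallest prefix covering top_p probability mass.
int sample_top_k_top_p(const std::vector<float> & logits,
                       int32_t top_k, float top_p, float temp, std::mt19937 & rng) {
    const float t = std::max(temp, 1e-6f); // guard against division by zero

    std::vector<std::pair<float, int>> cand(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) cand[i] = { logits[i] / t, (int) i };

    // top-k: partially sort so the k largest scaled logits come first
    top_k = std::min<int32_t>(top_k, (int32_t) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // softmax over the surviving candidates (subtract max for stability)
    std::vector<float> probs(cand.size());
    float sum = 0.0f;
    for (size_t i = 0; i < cand.size(); ++i) {
        probs[i] = std::exp(cand[i].first - cand[0].first);
        sum += probs[i];
    }
    for (float & p : probs) p /= sum;

    // top-p: cut the tail once cumulative probability reaches top_p
    float cum = 0.0f;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= top_p) { keep = i + 1; break; }
    }
    probs.resize(keep);

    // draw from the truncated distribution (renormalized internally)
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[(size_t) dist(rng)].second;
}
```

With the defaults above, each step keeps the 40 most likely tokens, trims them to the smallest set covering 95% of the probability mass, and divides logits by 0.80 before the softmax, so lowering `temp` sharpens the distribution and lowering `top_k`/`top_p` prunes it harder.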