In general, it seems to work, but I think it fails for Unicode character support. Hopefully, someone can help with that.
- I don't know yet how much the quantization affects the quality of the generated text (a sketch of the scheme follows this list)
- Probably the token sampling can be improved
- No Windows support
- x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
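Loosely, what the 4-bit quantization does is store weights in small blocks, each holding one floating-point scale plus a 4-bit integer per weight. Below is a minimal sketch of that round-to-nearest idea; the block size, struct name, and packing are illustrative assumptions, not ggml's exact on-disk format:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative 4-bit block quantization (assumed layout, not ggml's actual format):
// each block of QK floats becomes one float scale + QK packed 4-bit integers.
constexpr int QK = 32; // assumed block size

struct BlockQ4 {
    float   d;         // scale: value ~= (q - 8) * d
    uint8_t q[QK / 2]; // 32 quants, two 4-bit values per byte
};

BlockQ4 quantize_block(const float * x) {
    float amax = 0.0f; // absolute maximum in the block
    for (int i = 0; i < QK; ++i) amax = std::max(amax, std::fabs(x[i]));

    BlockQ4 b{};
    b.d = amax / 7.0f; // map [-amax, amax] onto the integer range [-7, 7]
    const float id = b.d != 0.0f ? 1.0f / b.d : 0.0f;

    for (int i = 0; i < QK; i += 2) {
        // round to nearest integer, shift into [1, 15], pack two per byte
        const uint8_t q0 = (uint8_t) std::min(15.0f, std::roundf(x[i + 0] * id) + 8.0f);
        const uint8_t q1 = (uint8_t) std::min(15.0f, std::roundf(x[i + 1] * id) + 8.0f);
        b.q[i / 2] = q0 | (q1 << 4);
    }
    return b;
}

void dequantize_block(const BlockQ4 & b, float * y) {
    for (int i = 0; i < QK; i += 2) {
        y[i + 0] = (float) ((b.q[i / 2] & 0x0F) - 8) * b.d;
        y[i + 1] = (float) ((b.q[i / 2] >> 4) - 8) * b.d;
    }
}

int main() {
    float x[QK], y[QK];
    for (int i = 0; i < QK; ++i) x[i] = std::sin(0.3f * i); // arbitrary sample data
    dequantize_block(quantize_block(x), y);
    for (int i = 0; i < 4; ++i) std::printf("%+.4f -> %+.4f\n", x[i], y[i]);
}
```

In a layout like this, a 32-float block shrinks from 128 bytes to 20, and the rounding error introduced by the shared per-block scale is exactly the quality question raised above.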
 }
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
+    //auto res = gpt_tokenize(vocab, text);
+
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
+    std::vector<gpt_vocab::id> res;
     if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+        res.push_back(1); // TODO: replace with vocab.bos
     }
-    //std::vector<gpt_vocab::id> res;
+    // find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
-    //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
-    //}
+        if (l == 0 && t != 13) {
+            break;
+        }
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
-
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
-
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+        res.push_back(t);
+        pos += l;
+    }
     return res;
 }
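The code enabled by this change is a greedy longest-match tokenizer: at every position it scans the whole vocabulary and consumes the longest token that matches the remaining text. A self-contained sketch of the same idea with a toy vocabulary (the vocabulary and token ids are made up for illustration):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using id = int;

// Toy stand-in for gpt_vocab::id_to_token (contents are made up)
const std::map<id, std::string> id_to_token = {
    {4, "he"}, {5, "hell"}, {6, "hello"}, {7, " world"}, {8, "o"},
};

// Greedy longest-match tokenization, the same idea as llama_tokenize above
std::vector<id> tokenize(const std::string & text) {
    std::vector<id> res;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t best_len = 0;
        id     best_id  = 0;
        for (const auto & kv : id_to_token) {
            const std::string & tok = kv.second;
            if (tok.size() <= best_len || tok.size() > text.size() - pos) continue;
            if (text.compare(pos, tok.size(), tok) == 0) {
                best_len = tok.size();
                best_id  = kv.first;
            }
        }
        if (best_len == 0) break; // nothing in the vocab matches here
        res.push_back(best_id);
        pos += best_len;
    }
    return res;
}

int main() {
    for (id t : tokenize("hello world")) std::printf("%d ", t); // prints: 6 7
    std::printf("\n");
}
```

Note that this rescans the entire vocabulary at every position, so it is O(text length × vocab size), and the loop stops as soon as nothing matches, so any byte sequence absent from the vocabulary (such as part of a multi-byte UTF-8 character) silently truncates the result. That failure mode is likely what the Unicode note at the top refers to.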
 struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
     int32_t n_batch = 8; // batch size for prompt processing
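For context on what `top_k`, `top_p`, and `temp` control, here is a generic sketch of temperature + top-k + top-p (nucleus) sampling over a logit vector. This illustrates the standard technique, not this repository's exact sampling code, and the function name is hypothetical:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

// Pick a token id from raw logits: scale by temperature, keep the top_k most
// likely tokens, then keep the smallest prefix covering top_p probability mass.
int sample_top_k_top_p(const std::vector<float> & logits,
                       int32_t top_k, float top_p, float temp, std::mt19937 & rng) {
    const float t = std::max(temp, 1e-6f); // guard against division by zero

    std::vector<std::pair<float, int>> cand(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) cand[i] = { logits[i] / t, (int) i };

    // top-k: partially sort so the k largest scaled logits come first
    top_k = std::min<int32_t>(top_k, (int32_t) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // softmax over the surviving candidates (subtract max for stability)
    std::vector<float> probs(cand.size());
    float sum = 0.0f;
    for (size_t i = 0; i < cand.size(); ++i) {
        probs[i] = std::exp(cand[i].first - cand[0].first);
        sum += probs[i];
    }
    for (float & p : probs) p /= sum;

    // top-p: cut the tail once cumulative probability reaches top_p
    float cum = 0.0f;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= top_p) { keep = i + 1; break; }
    }
    probs.resize(keep);

    // draw from the truncated distribution (renormalized internally)
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[(size_t) dist(rng)].second;
}
```

With the defaults above, each step keeps the 40 most likely tokens, trims them to the smallest set covering 95% of the probability mass, and divides logits by 0.80 before the softmax, so lowering `temp` sharpens the distribution and lowering `top_k`/`top_p` prunes it harder.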