throw std::invalid_argument("failed to convert utf8 to codepoint");
}
-//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
// std::vector<uint16_t> result;
-// if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-// result.emplace_back(cp);
+// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+// result.emplace_back(cpt);
// return result;
// }
-// if (0x10000 <= cp && cp <= 0x10ffff) {
-// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+// if (0x10000 <= cpt && cpt <= 0x10ffff) {
+// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
// return result;
// }
// throw std::invalid_argument("failed to convert codepoint to utf16");
// return result;
//}
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
- std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+ std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
assert (unicode_ranges_flags.begin()[0].first == 0);
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
};
size_t _prev_end = offset_ini;
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
};
size_t _prev_end = offset_ini;
// interface
//
// Encode a single Unicode code point as a UTF-8 byte sequence.
//
// cpt     - code point to encode; values up to 0x10ffff are accepted
// returns - a string holding the 1 to 4 bytes of the UTF-8 encoding
// throws  - std::invalid_argument when cpt is above 0x10ffff, instead of
//           silently handing back an empty string
std::string unicode_cpt_to_utf8(uint32_t cpt) {
    std::string result;

    // 1 byte: 0xxxxxxx
    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
        result.push_back(static_cast<char>(cpt));
        return result;
    }

    // 2 bytes: 110xxxxx 10xxxxxx
    if (0x80 <= cpt && cpt <= 0x7ff) {
        result.push_back(static_cast<char>(0xc0 | ((cpt >> 6) & 0x1f)));
        result.push_back(static_cast<char>(0x80 | (cpt & 0x3f)));
        return result;
    }

    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if (0x800 <= cpt && cpt <= 0xffff) {
        result.push_back(static_cast<char>(0xe0 | ((cpt >> 12) & 0x0f)));
        result.push_back(static_cast<char>(0x80 | ((cpt >> 6) & 0x3f)));
        result.push_back(static_cast<char>(0x80 | (cpt & 0x3f)));
        return result;
    }

    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (0x10000 <= cpt && cpt <= 0x10ffff) {
        result.push_back(static_cast<char>(0xf0 | ((cpt >> 18) & 0x07)));
        result.push_back(static_cast<char>(0x80 | ((cpt >> 12) & 0x3f)));
        result.push_back(static_cast<char>(0x80 | ((cpt >> 6) & 0x3f)));
        result.push_back(static_cast<char>(0x80 | (cpt & 0x3f)));
        return result;
    }

    // nothing above 0x10ffff can be encoded; report it rather than return ""
    throw std::invalid_argument("invalid codepoint");
}
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
- static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
static const auto cpt_flags = unicode_cpt_flags_array();
- return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+ return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
}
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
- static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
if (utf8.empty()) {
return undef; // undefined
}
size_t offset = 0;
- return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+ return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
}
std::string unicode_byte_to_utf8(uint8_t byte) {
return map.at(utf8);
}
-uint32_t unicode_tolower(uint32_t cp) {
+uint32_t unicode_tolower(uint32_t cpt) {
// binary search
- auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+ auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
return pair.first < value;
});
- if (it != unicode_map_lowercase.end() && it->first == cp) {
+ if (it != unicode_map_lowercase.end() && it->first == cpt) {
return it->second;
}
- return cp; // Return the original code point if no lowercase mapping is found
+ return cpt; // Return the original code point if no lowercase mapping is found
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
- { "\\p{N}", codepoint_flags::NUMBER },
- { "\\p{L}", codepoint_flags::LETTER },
- { "\\p{P}", codepoint_flags::PUNCTUATION },
+ { "\\p{N}", unicode_cpt_flags::NUMBER },
+ { "\\p{L}", unicode_cpt_flags::LETTER },
+ { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
};
static const std::map<int, int> k_ucat_cpt = {
- { codepoint_flags::NUMBER, 0xD1 },
- { codepoint_flags::LETTER, 0xD2 },
- { codepoint_flags::PUNCTUATION, 0xD3 },
+ { unicode_cpt_flags::NUMBER, 0xD1 },
+ { unicode_cpt_flags::LETTER, 0xD2 },
+ { unicode_cpt_flags::PUNCTUATION, 0xD3 },
};
static const std::map<int, std::string> k_ucat_map = {
- { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
- { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
- { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+ { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
+ { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
+ { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
};
// compute collapsed codepoints only if needed by at least one regex
bool need_collapse = false;
- for (auto & regex_expr : regex_exprs) {
+ for (const auto & regex_expr : regex_exprs) {
// search for unicode categories
for (const auto & ucat : k_ucat_enum) {
if (std::string::npos != regex_expr.find(ucat.first)) {
continue;
}
- const auto flags = unicode_cpt_flags(cpts[i]);
+ const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
if (flags.is_whitespace) {
//NOTE: C++ std::regex \s does not match 0x85; Rust and Python regex do.
std::vector<size_t> bpe_offsets = { cpts.size() };
- for (auto & regex_expr : regex_exprs) {
+ for (const auto & regex_expr : regex_exprs) {
// first, see if we have an efficient custom regex implementation
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
// with the corresponding collapsed representation
bool use_collapsed = false;
- for (auto & ucat : k_ucat_enum) {
+ for (const auto & ucat : k_ucat_enum) {
if (std::string::npos != regex_expr.find(ucat.first)) {
use_collapsed = true;
break;
// std::wregex \s does not match non-ASCII whitespace, using 0x0B as fallback
std::wstring wtext(cpts.begin(), cpts.end());
for (size_t i = 0; i < wtext.size(); ++i) {
- if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+ if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
wtext[i] = 0x0B;
}
}