diff --git a/gen_unicode.cc b/gen_unicode.cc new file mode 100644 index 0000000..4e1ff68 --- /dev/null +++ b/gen_unicode.cc @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#include +#include +#include + +using code_t = std::uint32_t; +using code_vec = std::vector; + +code_t hex_to_code(ostd::string_range hs) { + code_t ret = 0; + for (char c: hs) { + if (!std::isxdigit(c |= 32)) { + throw std::runtime_error{"malformed code point"}; + } + ret = ret * 16 + (c - ((c > '9') ? ('a' - 10) : '0')); + } + return ret; +} + +struct parse_state { + code_vec controls; + code_vec alphas; + code_vec lowers; + code_vec uppers; + code_vec tolowers; + code_vec touppers; + code_vec titles; + code_vec digits; + code_vec spaces; + + void assert_line(bool b) { + if (!b) { + throw std::runtime_error{"malformed line"}; + } + } + + void parse_line(ostd::string_range line) { + std::array bits; + for (std::size_t n = 0;;) { + auto sc = ostd::find(line, ';'); + if (!sc) { + assert_line(n >= (bits.size() - 1)); + bits[n] = line; + break; + } + bits[n++] = line.slice(0, sc.data() - line.data()); + sc.pop_front(); + line = sc; + } + assert_line(!bits[0].empty() && (bits[2].size() == 2)); + code_t n = hex_to_code(bits[0]); + + /* control chars */ + if (bits[2] == "Cc") { + controls.push_back(n); + return; + } + /* alphabetics */ + if (bits[2][0] == 'L') { + alphas.push_back(n); + /* can match more */ + } + /* lowercase */ + if (bits[2] == "Ll") { + lowers.push_back(n); + touppers.push_back(bits[12].empty() ? n : hex_to_code(bits[12])); + return; + } + /* uppercase */ + if (bits[2] == "Lu") { + uppers.push_back(n); + tolowers.push_back(bits[13].empty() ? n : hex_to_code(bits[13])); + return; + } + /* titlecase */ + if (bits[2] == "Lt") { + titles.push_back(n); + return; + } + /* digits */ + if (bits[2] == "Nd") { + digits.push_back(n); + return; + } + /* whitespace */ + if ( + (bits[2][0] == 'Z') && + (bits[4] == "B" || bits[4] == "S" || bits[4] == "WS") + ) { + spaces.push_back(n); + return; + } + /* good enough for now, ignore the rest */ + } +}; + +int main(int argc, char **argv) { + if (argc <= 1) { + ostd::writeln("not enough arguments"); + return 1; + } + + ostd::string_range fname = argv[1]; + ostd::file_stream f{fname}; + if (!f.is_open()) { + ostd::writefln("cannot open file '%s'", fname); + return 1; + } + + parse_state ps; + + for (auto const &line: f.iter_lines()) { + ps.parse_line(line); + } +}