From 2b291bca3912209299507d15bc48387a7eb921d5 Mon Sep 17 00:00:00 2001 From: q66 Date: Tue, 2 Jan 2018 19:03:39 +0100 Subject: [PATCH] implement the sorting logic in unicode generator --- COPYING.md | 6 +++ build.cc | 7 ++- gen_unicode.cc | 143 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 149 insertions(+), 7 deletions(-) diff --git a/COPYING.md b/COPYING.md index 6242ab8..1bad216 100644 --- a/COPYING.md +++ b/COPYING.md @@ -32,6 +32,12 @@ SOFTWARE.** Additionally some code from the libc++ project has been used as a reference; libc++ is a part of the LLVM project. +The libutf project (https://github.com/cls/libutf) was used as an inspiration +for some of the Unicode implementation bits within libostd and the gen\_unicode +program used to generate the necessary ctype tables, but all of the code was +written from scratch; nevertheless, I would like to give some credit and thanks +to the project here. + Additional thanks to Dale Weiler aka graphitemaster (reference code in the Neothyne project) and cppreference.com. diff --git a/build.cc b/build.cc index bdb43f8..13952cd 100644 --- a/build.cc +++ b/build.cc @@ -1,4 +1,9 @@ -/* A build system for libostd. */ +/* A simple build system to build libostd. + * + * This file is a part of the libostd project. Libostd is licensed under the + * University of Illinois/NCSA Open Source License, as is this file. See the + * COPYING.md file for further information. + */ /* for Windows so that we avoid dllimport/dllexport */ #define OSTD_BUILD_LIB diff --git a/gen_unicode.cc b/gen_unicode.cc index 4e1ff68..998b1f7 100644 --- a/gen_unicode.cc +++ b/gen_unicode.cc @@ -1,16 +1,28 @@ +/* This simple program generates the tables necessary for Unicode character + * types. It's inspired by the mkrunetype.awk generator from the libutf + * project, see COPYING.md. + * + * This file is a part of the libostd project. Libostd is licensed under the + * University of Illinois/NCSA Open Source License, as is this file. 
See the + * COPYING.md file for further information. + */ + #include #include #include +#include #include #include #include #include +using ostd::string_range; + using code_t = std::uint32_t; using code_vec = std::vector; -code_t hex_to_code(ostd::string_range hs) { +code_t hex_to_code(string_range hs) { code_t ret = 0; for (char c: hs) { if (!std::isxdigit(c |= 32)) { @@ -38,8 +50,8 @@ struct parse_state { } } - void parse_line(ostd::string_range line) { - std::array bits; + void parse_line(string_range line) { + std::array bits; for (std::size_t n = 0;;) { auto sc = ostd::find(line, ';'); if (!sc) { @@ -96,6 +108,105 @@ struct parse_state { } /* good enough for now, ignore the rest */ } + /* write out codes (with optional case mappings) as ranges, laces, singles */ + void build(code_vec const &codes, code_vec const &cases = code_vec{}) { + code_vec singles; + code_vec singles_cases; + code_vec ranges_beg; + code_vec ranges_end; + code_vec ranges_cases; + code_vec laces_beg[2]; + code_vec laces_end[2]; + /* a non-empty case list must be as long as the code list */ + if (!cases.empty() && (cases.size() != codes.size())) { + throw std::runtime_error{"mismatched code lists"}; + } + /* predicates on the pair (i, i + 1); all are false at the last index */ + auto match_pair = [&codes](std::size_t i, std::size_t off) { + return ( + ((i + 1) < codes.size()) && (codes[i + 1] == (codes[i] + off)) + ); + }; + auto match_range = [&codes, &cases, &match_pair](std::size_t i) { + return match_pair(i, 1) && ( + cases.empty() || (cases[i + 1] == (cases[i] + 1)) + ); + }; + auto match_lace = [ + &codes, &cases, &match_pair + ](std::size_t i, int off) { + return match_pair(i, 2) && (cases.empty() || ( + (cases[i + 1] == (codes[i + 1] + off)) && + (cases[i ] == (codes[i ] + off)) + )); + }; + /* greedy split: try a range first, then a lace, else a single */ + for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) { + if (match_range(i)) { + ranges_beg.push_back(codes[i]); + if (!cases.empty()) { + ranges_cases.push_back(cases[i]); + } + /* go to the end of sequence */ + for (++i; match_range(i); ++i) { + continue; + } + /* end of range, try others */ + ranges_end.push_back(codes[i]); + continue; + } + if (int off = match_lace(i, 1) ? 1 : -1; 
match_lace(i, off)) { + /* slot 0: case is code + 1 (or no cases), slot 1: case is code - 1 */ + std::size_t const j = (off < 0) ? 1 : 0; + laces_beg[j].push_back(codes[i]); + for (++i; match_lace(i, off); ++i) { continue; } + laces_end[j].push_back(codes[i]); + continue; + } + singles.push_back(codes[i]); + if (!cases.empty()) { + singles_cases.push_back(cases[i]); + } + } + /* writes one named table; an empty first column writes nothing */ + auto build_list = []( + string_range name, std::size_t ncol, + code_vec const &col1, code_vec const &col2, code_vec const &col3 + ) { + if (col1.empty()) { + return; + } + ostd::writefln("%s:", name); + for (std::size_t i = 0; i < col1.size(); ++i) { + switch (ncol) { + case 1: + ostd::writefln(" 0x%06X", col1[i]); + break; + case 2: + ostd::writefln(" 0x%06X, 0x%06X", col1[i], col2[i]); + break; + case 3: + ostd::writefln( + " 0x%06X, 0x%06X, 0x%06X", + col1[i], col2[i], col3[i] + ); + break; + default: + throw std::runtime_error{"invalid column number"}; + break; + } + } + }; + /* ranges and singles get one extra case column when cases exist */ + build_list( + "ranges", !cases.empty() + 2, ranges_beg, ranges_end, ranges_cases + ); + build_list("laces1", 2, laces_beg[0], laces_end[0], laces_beg[0]); + build_list("laces2", 2, laces_beg[1], laces_end[1], laces_beg[1]); + build_list( + "singles", !cases.empty() + 1, singles, singles_cases, singles + ); + } }; int main(int argc, char **argv) { @@ -104,7 +215,7 @@ int main(int argc, char **argv) { return 1; } - ostd::string_range fname = argv[1]; + string_range fname = argv[1]; ostd::file_stream f{fname}; if (!f.is_open()) { ostd::writefln("cannot open file '%s'", fname); @@ -113,7 +224,27 @@ int main(int argc, char **argv) { parse_state ps; - for (auto const &line: f.iter_lines()) { - ps.parse_line(line); + try { + for (auto const &line: f.iter_lines()) { + ps.parse_line(line); + } + } catch (std::runtime_error const &e) { + ostd::writeln(e.what()); + return 1; } + + ostd::writeln("ALPHAS:"); + ps.build(ps.alphas); + ostd::writeln("CONTROL:"); + ps.build(ps.controls); + ostd::writeln("DIGITS:"); + ps.build(ps.digits); + ostd::writeln("LOWERCASE:"); + ps.build(ps.lowers, ps.touppers); + ostd::writeln("SPACES:"); + 
ps.build(ps.spaces); + ostd::writeln("TITLES:"); + ps.build(ps.titles); + ostd::writeln("UPPERCASE:"); + ps.build(ps.uppers, ps.tolowers); }