diff --git a/.gitignore b/.gitignore index 0ecf603..b44016f 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ examples/stream1 examples/stream2 test_runner doc/output +src/string_utf.hh diff --git a/build.cc b/build.cc index 13952cd..4ea5314 100644 --- a/build.cc +++ b/build.cc @@ -29,6 +29,9 @@ namespace fs = ostd::filesystem; #include "src/process.cc" #include "src/filesystem.cc" +#define OSTD_GEN_UNICODE_INCLUDE +#include "gen_unicode.cc" + using strvec = std::vector; using pathvec = std::vector; @@ -50,6 +53,9 @@ static pathvec TEST_CASES = { "algorithm", "range" }; +static fs::path OSTD_UNICODE_DATA = "data/UnicodeData-10.0.txt"; +static fs::path OSTD_UNICODE_SRC = CXX_SOURCE_DIR / "string_utf.hh"; + static fs::path OSTD_SHARED_LIB = "libostd.so"; static fs::path OSTD_STATIC_LIB = "libostd.a"; @@ -261,6 +267,7 @@ int main(int argc, char **argv) { rp.replace_filename(cso.string() + "_dyn.o"); try_remove(rp); } + try_remove(OSTD_UNICODE_SRC); try_remove(OSTD_STATIC_LIB); try_remove(OSTD_SHARED_LIB); try_remove("test_runner.o"); @@ -495,6 +502,11 @@ int main(int argc, char **argv) { } }; + echo_q("Generating Unicode tables..."); + ostd::unicode_gen::parse_state{}.build_all_from_file( + OSTD_UNICODE_DATA.string(), OSTD_UNICODE_SRC.string() + ); + echo_q("Building the library..."); build_all(ASM_SOURCES, ASM_SOURCE_DIR, ".S", call_as); build_all(cxx_sources.get(), fs::path{}, ".cc", call_cxx); diff --git a/gen_unicode.cc b/gen_unicode.cc index 998b1f7..e9e1836 100644 --- a/gen_unicode.cc +++ b/gen_unicode.cc @@ -9,20 +9,23 @@ #include #include +#include #include #include #include +#include #include #include #include -using ostd::string_range; +namespace ostd { +namespace unicode_gen { using code_t = std::uint32_t; using code_vec = std::vector; -code_t hex_to_code(string_range hs) { +inline code_t hex_to_code(string_range hs) { code_t ret = 0; for (char c: hs) { if (!std::isxdigit(c |= 32)) { @@ -65,7 +68,6 @@ struct parse_state { } 
assert_line(!bits[0].empty() && (bits[2].size() == 2)); code_t n = hex_to_code(bits[0]); - /* control chars */ if (bits[2] == "Cc") { controls.push_back(n); @@ -109,7 +111,11 @@ struct parse_state { /* good enough for now, ignore the rest */ } - void build(code_vec const &codes, code_vec const &cases = code_vec{}) { + template + void build( + R &writer, string_range name, + code_vec const &codes, code_vec const &cases = code_vec{} + ) { code_vec singles; code_vec singles_cases; code_vec ranges_beg; @@ -134,14 +140,18 @@ struct parse_state { }; auto match_lace = [ &codes, &cases, &match_pair - ](std::size_t i, int off) { + ](std::size_t i, std::size_t offs) { + int off = (!int(offs) * 2) - 1; return match_pair(i, 2) && (cases.empty() || ( (cases[i + 1] == (codes[i + 1] + off)) && (cases[i ] == (codes[i ] + off)) )); }; - for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) { + bool endseq = false; + std::size_t i = 0; + std::size_t ncodes = codes.size(); + while (i < ncodes) { if (match_range(i)) { ranges_beg.push_back(codes[i]); if (!cases.empty()) { @@ -153,42 +163,54 @@ struct parse_state { } /* end of range, try others */ ranges_end.push_back(codes[i]); + endseq = true; continue; } - if (size_t j = 0; match_lace(i, 1) || match_lace(i, -1)) { + if (size_t j = 0; match_lace(i, j) || match_lace(i, ++j)) { laces_beg[j].push_back(codes[i]); for (++i; match_lace(i, j); ++i) { continue; } laces_end[j].push_back(codes[i]); + endseq = true; continue; } - singles.push_back(codes[i]); - if (!cases.empty()) { - singles_cases.push_back(cases[i]); + if (!endseq) { + singles.push_back(codes[i]); + if (!cases.empty()) { + singles_cases.push_back(cases[i]); + } } + endseq = false; + ++i; } - auto build_list = []( - string_range name, std::size_t ncol, + auto build_list = [&writer, &name]( + string_range aname, std::size_t ncol, code_vec const &col1, code_vec const &col2, code_vec const &col3 ) { if (col1.empty()) { return; } - ostd::writefln("%s:", name); - for 
(std::size_t i = 0; i < col1.size(); ++i) { + format( + writer, "static char32_t %s_%s[][%d] = {\n", + name, aname, ncol + ); + for (std::size_t j = 0; j < col1.size(); ++j) { switch (ncol) { case 1: - ostd::writefln(" 0x%06X", col1[i]); + format(writer, " { 0x%06X },\n", col1[j]); break; case 2: - ostd::writefln(" 0x%06X, 0x%06X", col1[i], col2[i]); + format( + writer, " { 0x%06X, 0x%06X },\n", + col1[j], col2[j] + ); break; case 3: - ostd::writefln( - " 0x%06X, 0x%06X, 0x%06X", - col1[i], col2[i], col3[i] + format( + writer, " { 0x%06X, 0x%06X, 0x%06X },\n", + col1[j], col2[j], col3[j] ); break; default: @@ -196,8 +218,15 @@ struct parse_state { break; } } + format(writer, "};\n\n"); }; + if (cases.empty()) { + format(writer, "\n/* is%s */\n\n", name); + } else { + format(writer, "\n/* is%s, to%s */\n\n", name, name); + } + build_list( "ranges", !cases.empty() + 2, ranges_beg, ranges_end, ranges_cases ); @@ -206,45 +235,132 @@ struct parse_state { build_list( "singles", !cases.empty() + 1, singles, singles_cases, singles ); + + /* is_CTYPE(c) */ + build_func( + writer, name, "is", "bool", + ranges_beg, laces_beg[0], laces_beg[1], singles + ); + + /* to_CTYPE(c) */ + if (!cases.empty()) { + build_func( + writer, name, "to", "char32_t", + ranges_beg, laces_beg[0], laces_beg[1], singles + ); + } + } + + template + void build_header(R &writer) { + char buf[64]; + time_t curt; + std::time(&curt); + strftime(buf, sizeof(buf), "%c", std::localtime(&curt)); + format( + writer, "/* Generated on %s by gen_unicode (libostd) */\n", + static_cast(buf) + ); + } + + template + void build_func( + R &writer, + string_range name, + string_range prefix, + string_range ret_type, + code_vec const &ranges, + code_vec const &laces1, + code_vec const &laces2, + code_vec const &singles + ) { + format( + writer, "%s %s%s(char32_t c) {\n", ret_type, prefix, name + ); + format(writer, " return utf::uctype_func<\n"); + auto it1 = { &ranges, &laces1, &laces2, &singles }; + auto it2 = { 
"ranges", "laces1", "laces2", "singles" }; + for (std::size_t i = 0; i < it1.size(); ++i) { + if (it1.begin()[i]->empty()) { + format(writer, " 0, 0"); + } else { + format( + writer, " sizeof(%s_%s), sizeof(*%s_%s)", + name, it2.begin()[i], name, it2.begin()[i] + ); + } + if ((i + 1) != it1.size()) { + format(writer, ",\n"); + } else { + format(writer, "\n"); + } + } + format(writer, " >::do_%s(\n c, ", prefix); + for (std::size_t i = 0; i < it1.size(); ++i) { + if (i != 0) { + format(writer, ", "); + } + if (it1.begin()[i]->empty()) { + format(writer, "nullptr"); + } else { + format(writer, "%s_%s", name, it2.begin()[i]); + } + } + format(writer, "\n );\n}\n\n"); + } + + template + void build_all(R &writer, IR lines) { + for (auto const &line: lines) { + parse_line(line); + } + + build_header(writer); + + build(writer, "alpha", alphas); + build(writer, "cntrl", controls); + build(writer, "digit", digits); + build(writer, "lower", lowers, touppers); + build(writer, "space", spaces); + build(writer, "title", titles); + build(writer, "upper", uppers, tolowers); + } + + void build_all_from_file(string_range input, string_range output) { + file_stream ifs{input}; + if (!ifs.is_open()) { + throw std::runtime_error{"could not open input file"}; + } + file_stream ofs{output, stream_mode::WRITE}; + if (!ofs.is_open()) { + throw std::runtime_error{"could not open output file"}; + } + auto writer = ofs.iter(); + build_all(writer, ifs.iter_lines()); } }; +} /* namespace unicode_gen */ +} /* namespace ostd */ + +#ifndef OSTD_GEN_UNICODE_INCLUDE int main(int argc, char **argv) { if (argc <= 1) { ostd::writeln("not enough arguments"); return 1; } - - string_range fname = argv[1]; - ostd::file_stream f{fname}; - if (!f.is_open()) { - ostd::writefln("cannot open file '%s'", fname); - return 1; + ostd::string_range fname = argv[1]; + ostd::string_range outname = "src/string_utf.hh"; + if (argc >= 3) { + outname = argv[2]; } - parse_state ps; - + ostd::unicode_gen::parse_state ps; 
try { - for (auto const &line: f.iter_lines()) { - ps.parse_line(line); - } + ps.build_all_from_file(fname, outname); } catch (std::runtime_error const &e) { ostd::writeln(e.what()); return 1; } - - ostd::writeln("ALPHAS:"); - ps.build(ps.alphas); - ostd::writeln("CONTROL:"); - ps.build(ps.controls); - ostd::writeln("DIGITS:"); - ps.build(ps.digits); - ostd::writeln("LOWERCASE:"); - ps.build(ps.lowers, ps.touppers); - ostd::writeln("SPACES:"); - ps.build(ps.spaces); - ostd::writeln("TITLES:"); - ps.build(ps.titles); - ostd::writeln("UPPERCASE:"); - ps.build(ps.uppers, ps.tolowers); } +#endif diff --git a/ostd/string.hh b/ostd/string.hh index 3748926..5d03e8f 100644 --- a/ostd/string.hh +++ b/ostd/string.hh @@ -888,6 +888,23 @@ namespace utf { return detail::codepoint_range{r}; } + bool isalnum(char32_t c); + bool isalpha(char32_t c); + bool isblank(char32_t c); + bool iscntrl(char32_t c); + bool isdigit(char32_t c); + bool isgraph(char32_t c); + bool islower(char32_t c); + bool isprint(char32_t c); + bool ispunct(char32_t c); + bool isspace(char32_t c); + bool istitle(char32_t c); + bool isupper(char32_t c); + bool isvalid(char32_t c); + bool isxdigit(char32_t c); + char32_t tolower(char32_t c); + char32_t toupper(char32_t c); + /** @} */ } /* namespace utf */ diff --git a/src/string.cc b/src/string.cc index faaadc2..5adeaa0 100644 --- a/src/string.cc +++ b/src/string.cc @@ -4,6 +4,7 @@ */ #include +#include #include "ostd/string.hh" namespace ostd { @@ -114,5 +115,193 @@ std::size_t length(string_range r, string_range &cont) noexcept { return ret; } +/* unicode-aware ctype + * the other ones use custom tables for lookups + */ + +bool isalnum(char32_t c) { + return (utf::isalpha(c) || utf::isdigit(c)); +} + +bool isblank(char32_t c) { + return ((c == ' ') || (c == '\t')); +} + +bool isgraph(char32_t c) { + return (!utf::isspace(c) && utf::isprint(c)); +} + +bool isprint(char32_t c) { + switch (std::uint32_t(c)) { + case 0x2028: + case 0x2029: + case 0xFFF9: + case 
0xFFFA: + case 0xFFFB: + return false; + default: + return !utf::iscntrl(c); + } +} + +bool ispunct(char32_t c) { + return (utf::isgraph(c) && !utf::isalnum(c)); +} + +bool isvalid(char32_t c) { + /* surrogate code points */ + if ((c >= 0xD800) && (c <= 0xDFFF)) { + return false; + } + /* non-characters */ + if ((c >= 0xFDD0) && (c <= 0xFDEF)) { + return false; + } + /* end of plane */ + if ((c & 0xFFFE) == 0xFFFE) { + return false; + } + /* must be within range */ + return (c <= MaxCodepoint); +} + +bool isxdigit(char32_t c) { + if ((c >= '0') && (c <= '9')) { + return true; + } + auto uc = std::uint32_t(c) | 32; + return ((uc >= 'a') && (uc <= 'f')); +} + +inline int codepoint_cmp1(void const *a, void const *b) { + char32_t c1 = *static_cast(a); + char32_t c2 = *static_cast(b); + return (c1 - c2); +} + +inline int codepoint_cmp2(void const *a, void const *b) { + char32_t c = *static_cast(a); + char32_t const *p = static_cast(b); + if ((c >= p[0]) && (c <= p[1])) { + return 0; + } + return (c - p[0]); +} + +template< + std::size_t RangesN, std::size_t RangesS, + std::size_t Laces1N, std::size_t Laces1S, + std::size_t Laces2N, std::size_t Laces2S, + std::size_t SinglesN, std::size_t SinglesS +> +struct uctype_func { + static bool do_is( + char32_t c, + void const *ranges [[maybe_unused]], + void const *laces1 [[maybe_unused]], + void const *laces2 [[maybe_unused]], + void const *singles [[maybe_unused]] + ) { + if constexpr(RangesN != 0) { + char32_t *found = static_cast(std::bsearch( + &c, ranges, RangesN / RangesS, RangesS / sizeof(char32_t), + codepoint_cmp2 + )); + if (found) { + return true; + } + } + if constexpr(Laces1N != 0) { + char32_t *found = static_cast(std::bsearch( + &c, laces1, Laces1N / Laces1S, Laces1S / sizeof(char32_t), + codepoint_cmp2 + )); + if (found) { + return !((c - found[0]) % 2); + } + } + if constexpr(Laces2N != 0) { + char32_t *found = static_cast(std::bsearch( + &c, laces2, Laces2N / Laces2S, Laces2S / sizeof(char32_t), + 
codepoint_cmp2 + )); + if (found) { + return !((c - found[0]) % 2); + } + } + if constexpr(SinglesN != 0) { + char32_t *found = static_cast(std::bsearch( + &c, singles, SinglesN / SinglesS, SinglesS / sizeof(char32_t), + codepoint_cmp1 + )); + if (found) { + return true; + } + } + return false; + } + + static char32_t do_to( + char32_t c, + void const *ranges [[maybe_unused]], + void const *laces1 [[maybe_unused]], + void const *laces2 [[maybe_unused]], + void const *singles [[maybe_unused]] + ) { + if constexpr(RangesN != 0) { + char32_t *found = static_cast(std::bsearch( + &c, ranges, RangesN >> 4, RangesN & 0xF, codepoint_cmp2 + )); + if (found) { + return (found[2] + (c - found[0])); + } + } + if constexpr(Laces1N != 0) { + char32_t *found = static_cast(std::bsearch( + &c, laces1, Laces1N >> 4, Laces1N & 0xF, codepoint_cmp2 + )); + if (found) { + if ((c - found[0]) % 2) { + return c; + } + return c + 1; + } + } + if constexpr(Laces2N != 0) { + char32_t *found = static_cast(std::bsearch( + &c, laces2, Laces2N >> 4, Laces2N & 0xF, codepoint_cmp2 + )); + if (found) { + if ((c - found[0]) % 2) { + return c; + } + return c - 1; + } + } + if constexpr(SinglesN != 0) { + char32_t *found = static_cast(std::bsearch( + &c, singles, SinglesN >> 4, SinglesN & 0xF, codepoint_cmp1 + )); + if (found) { + return found[1]; + } + } + return c; + } +}; + +/* these are generated */ +bool isalpha(char32_t c); +bool iscntrl(char32_t c); +bool isdigit(char32_t c); +bool islower(char32_t c); +bool isspace(char32_t c); +bool istitle(char32_t c); +bool isupper(char32_t c); +char32_t tolower(char32_t c); +char32_t toupper(char32_t c); + +#include "string_utf.hh" + } /* namespace utf */ } /* namespace ostd */