implement the sorting logic in unicode generator

master
Daniel Kolesa 2018-01-02 19:03:39 +01:00
parent 80aadd906e
commit 2b291bca39
3 changed files with 149 additions and 7 deletions

View File

@ -32,6 +32,12 @@ SOFTWARE.**
Additionally some code from the libc++ project has been used as a reference; Additionally some code from the libc++ project has been used as a reference;
libc++ is a part of the LLVM project. libc++ is a part of the LLVM project.
The libutf project (https://github.com/cls/libutf) was used as an inspiration
for some of the Unicode implementation bits within libostd and the gen\_unicode
program used to generate the necessary ctype tables, but all of the code was
written from scratch; nevertheless, I would like to give some credit and thanks
to the project here.
Additional thanks to Dale Weiler aka graphitemaster (reference code in the Additional thanks to Dale Weiler aka graphitemaster (reference code in the
Neothyne project) and cppreference.com. Neothyne project) and cppreference.com.

View File

@ -1,4 +1,9 @@
/* A build system for libostd. */ /* A simple build system to build libostd.
*
* This file is a part of the libostd project. Libostd is licensed under the
* University of Illinois/NCSA Open Source License, as is this file. See the
* COPYING.md file for further information.
*/
/* for Windows so that we avoid dllimport/dllexport */ /* for Windows so that we avoid dllimport/dllexport */
#define OSTD_BUILD_LIB #define OSTD_BUILD_LIB

View File

@ -1,16 +1,28 @@
/* This simple program generates the tables necessary for Unicode character
* types. It's inspired by the mkrunetype.awk generator from the libutf
* project, see COPYING.md.
*
* This file is a part of the libostd project. Libostd is licensed under the
* University of Illinois/NCSA Open Source License, as is this file. See the
* COPYING.md file for further information.
*/
#include <cstdint> #include <cstdint>
#include <cctype> #include <cctype>
#include <vector> #include <vector>
#include <array>
#include <stdexcept> #include <stdexcept>
#include <ostd/io.hh> #include <ostd/io.hh>
#include <ostd/string.hh> #include <ostd/string.hh>
#include <ostd/algorithm.hh> #include <ostd/algorithm.hh>
using ostd::string_range;
using code_t = std::uint32_t; using code_t = std::uint32_t;
using code_vec = std::vector<code_t>; using code_vec = std::vector<code_t>;
code_t hex_to_code(ostd::string_range hs) { code_t hex_to_code(string_range hs) {
code_t ret = 0; code_t ret = 0;
for (char c: hs) { for (char c: hs) {
if (!std::isxdigit(c |= 32)) { if (!std::isxdigit(c |= 32)) {
@ -38,8 +50,8 @@ struct parse_state {
} }
} }
void parse_line(ostd::string_range line) { void parse_line(string_range line) {
std::array<ostd::string_range, 15> bits; std::array<string_range, 15> bits;
for (std::size_t n = 0;;) { for (std::size_t n = 0;;) {
auto sc = ostd::find(line, ';'); auto sc = ostd::find(line, ';');
if (!sc) { if (!sc) {
@ -96,6 +108,105 @@ struct parse_state {
} }
/* good enough for now, ignore the rest */ /* good enough for now, ignore the rest */
} }
/* Groups the given code points into compact tables and prints them.
 *
 * The `codes` list is walked once and each entry is put in one of:
 *
 *   - a range: a run where every next code is the previous code plus one
 *     (and, when `cases` is given, the mapped values also step by one);
 *   - a lace: a run where every next code is the previous code plus two;
 *     when `cases` is given, each mapped value must additionally equal
 *     its code plus one (printed as "laces1") or its code minus one
 *     (printed as "laces2");
 *   - a single: anything that did not start a range or a lace.
 *
 * `cases` is optional; when non-empty it must be the same length as
 * `codes` (entry i of `cases` belongs to entry i of `codes`), otherwise
 * std::runtime_error is thrown.
 */
void build(code_vec const &codes, code_vec const &cases = code_vec{}) {
    code_vec singles;
    code_vec singles_cases;
    code_vec ranges_beg;
    code_vec ranges_end;
    code_vec ranges_cases;
    /* slot 0 holds laces mapping to code + 1, slot 1 those to code - 1 */
    code_vec laces_beg[2];
    code_vec laces_end[2];

    if (!cases.empty() && (cases.size() != codes.size())) {
        throw std::runtime_error{"mismatched code lists"};
    }

    /* true when entry i has a successor that is exactly `off` above it */
    auto match_pair = [&codes](std::size_t i, std::size_t off) {
        return (
            ((i + 1) < codes.size()) && (codes[i + 1] == (codes[i] + off))
        );
    };
    /* entries i and i + 1 are adjacent codes with adjacent mappings */
    auto match_range = [&codes, &cases, &match_pair](std::size_t i) {
        return match_pair(i, 1) && (
            cases.empty() || (cases[i + 1] == (cases[i] + 1))
        );
    };
    /* entries i and i + 1 are two apart and both map to code + off;
     * the addition is done in unsigned arithmetic, so off == -1 wraps
     * around to "code minus one" as intended
     */
    auto match_lace = [
        &codes, &cases, &match_pair
    ](std::size_t i, int off) {
        return match_pair(i, 2) && (cases.empty() || (
            (cases[i + 1] == (codes[i + 1] + off)) &&
            (cases[i] == (codes[i] + off))
        ));
    };

    for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) {
        if (match_range(i)) {
            ranges_beg.push_back(codes[i]);
            if (!cases.empty()) {
                ranges_cases.push_back(cases[i]);
            }
            /* go to the end of sequence */
            for (++i; match_range(i); ++i) {
                continue;
            }
            /* end of range, try others */
            ranges_end.push_back(codes[i]);
            continue;
        }
        /* figure out which lace kind (if any) starts here; the offset
         * has to be remembered so that the lace is extended using the
         * same offset and stored in the matching slot
         */
        int lace_off = 0;
        if (match_lace(i, 1)) {
            lace_off = 1;
        } else if (match_lace(i, -1)) {
            lace_off = -1;
        }
        if (lace_off != 0) {
            std::size_t const slot = (lace_off == 1) ? 0 : 1;
            laces_beg[slot].push_back(codes[i]);
            /* go to the end of the lace */
            for (++i; match_lace(i, lace_off); ++i) {
                continue;
            }
            laces_end[slot].push_back(codes[i]);
            continue;
        }
        singles.push_back(codes[i]);
        if (!cases.empty()) {
            singles_cases.push_back(cases[i]);
        }
    }

    /* prints a named table with `ncol` columns, one row per entry of
     * col1; columns beyond `ncol` are never read; empty tables print
     * nothing at all
     */
    auto build_list = [](
        string_range name, std::size_t ncol,
        code_vec const &col1, code_vec const &col2, code_vec const &col3
    ) {
        if (col1.empty()) {
            return;
        }
        ostd::writefln("%s:", name);
        for (std::size_t i = 0; i < col1.size(); ++i) {
            switch (ncol) {
                case 1:
                    ostd::writefln(" 0x%06X", col1[i]);
                    break;
                case 2:
                    ostd::writefln(" 0x%06X, 0x%06X", col1[i], col2[i]);
                    break;
                case 3:
                    ostd::writefln(
                        " 0x%06X, 0x%06X, 0x%06X",
                        col1[i], col2[i], col3[i]
                    );
                    break;
                default:
                    throw std::runtime_error{"invalid column number"};
            }
        }
    };

    /* the case mapping adds one extra column where it is stored */
    bool const has_cases = !cases.empty();
    build_list(
        "ranges", has_cases ? 3 : 2, ranges_beg, ranges_end, ranges_cases
    );
    build_list("laces1", 2, laces_beg[0], laces_end[0], laces_beg[0]);
    build_list("laces2", 2, laces_beg[1], laces_end[1], laces_beg[1]);
    build_list(
        "singles", has_cases ? 2 : 1, singles, singles_cases, singles
    );
}
}; };
int main(int argc, char **argv) { int main(int argc, char **argv) {
@ -104,7 +215,7 @@ int main(int argc, char **argv) {
return 1; return 1;
} }
ostd::string_range fname = argv[1]; string_range fname = argv[1];
ostd::file_stream f{fname}; ostd::file_stream f{fname};
if (!f.is_open()) { if (!f.is_open()) {
ostd::writefln("cannot open file '%s'", fname); ostd::writefln("cannot open file '%s'", fname);
@ -113,7 +224,27 @@ int main(int argc, char **argv) {
parse_state ps; parse_state ps;
for (auto const &line: f.iter_lines()) { try {
ps.parse_line(line); for (auto const &line: f.iter_lines()) {
ps.parse_line(line);
}
} catch (std::runtime_error const &e) {
ostd::writeln(e.what());
return 1;
} }
ostd::writeln("ALPHAS:");
ps.build(ps.alphas);
ostd::writeln("CONTROL:");
ps.build(ps.controls);
ostd::writeln("DIGITS:");
ps.build(ps.digits);
ostd::writeln("LOWERCASE:");
ps.build(ps.lowers, ps.touppers);
ostd::writeln("SPACES:");
ps.build(ps.spaces);
ostd::writeln("TITLES:");
ps.build(ps.titles);
ostd::writeln("UPPERCASE:");
ps.build(ps.uppers, ps.tolowers);
} }