implement the sorting logic in unicode generator

master
Daniel Kolesa 2018-01-02 19:03:39 +01:00
parent 80aadd906e
commit 2b291bca39
3 changed files with 149 additions and 7 deletions

View File

@ -32,6 +32,12 @@ SOFTWARE.**
Additionally some code from the libc++ project has been used as a reference;
libc++ is a part of the LLVM project.
The libutf project (https://github.com/cls/libutf) was used as an inspiration
for some of the Unicode implementation bits within libostd and the gen\_unicode
program used to generate the necessary ctype tables, but all of the code was
written from scratch; nevertheless, I would like to give some credit and thanks
to the project here.
Additional thanks to Dale Weiler aka graphitemaster (reference code in the
Neothyne project) and cppreference.com.

View File

@ -1,4 +1,9 @@
/* A build system for libostd. */
/* A simple build system to build libostd.
*
* This file is a part of the libostd project. Libostd is licensed under the
* University of Illinois/NCSA Open Source License, as is this file. See the
* COPYING.md file further information.
*/
/* for Windows so that we avoid dllimport/dllexport */
#define OSTD_BUILD_LIB

View File

@ -1,16 +1,28 @@
/* This simple program generates the tables necessary for Unicode character
* types. It's inspired by the mkrunetype.awk generator from the libutf
* project, see COPYING.md.
*
* This file is a part of the libostd project. Libostd is licensed under the
* University of Illinois/NCSA Open Source License, as is this file. See the
* COPYING.md file further information.
*/
#include <cstdint>
#include <cctype>
#include <vector>
#include <array>
#include <stdexcept>
#include <ostd/io.hh>
#include <ostd/string.hh>
#include <ostd/algorithm.hh>
using ostd::string_range;
using code_t = std::uint32_t;
using code_vec = std::vector<code_t>;
code_t hex_to_code(ostd::string_range hs) {
code_t hex_to_code(string_range hs) {
code_t ret = 0;
for (char c: hs) {
if (!std::isxdigit(c |= 32)) {
@ -38,8 +50,8 @@ struct parse_state {
}
}
void parse_line(ostd::string_range line) {
std::array<ostd::string_range, 15> bits;
void parse_line(string_range line) {
std::array<string_range, 15> bits;
for (std::size_t n = 0;;) {
auto sc = ostd::find(line, ';');
if (!sc) {
@ -96,6 +108,105 @@ struct parse_state {
}
/* good enough for now, ignore the rest */
}
void build(code_vec const &codes, code_vec const &cases = code_vec{}) {
code_vec singles;
code_vec singles_cases;
code_vec ranges_beg;
code_vec ranges_end;
code_vec ranges_cases;
code_vec laces_beg[2];
code_vec laces_end[2];
if (!cases.empty() && (cases.size() != codes.size())) {
throw std::runtime_error{"mismatched code lists"};
}
auto match_pair = [&codes](std::size_t i, std::size_t off) {
return (
((i + 1) < codes.size()) && (codes[i + 1] == (codes[i] + off))
);
};
auto match_range = [&codes, &cases, &match_pair](std::size_t i) {
return match_pair(i, 1) && (
cases.empty() || (cases[i + 1] == (cases[i] + 1))
);
};
auto match_lace = [
&codes, &cases, &match_pair
](std::size_t i, int off) {
return match_pair(i, 2) && (cases.empty() || (
(cases[i + 1] == (codes[i + 1] + off)) &&
(cases[i ] == (codes[i ] + off))
));
};
for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) {
if (match_range(i)) {
ranges_beg.push_back(codes[i]);
if (!cases.empty()) {
ranges_cases.push_back(cases[i]);
}
/* go to the end of sequence */
for (++i; match_range(i); ++i) {
continue;
}
/* end of range, try others */
ranges_end.push_back(codes[i]);
continue;
}
if (size_t j = 0; match_lace(i, 1) || match_lace(i, -1)) {
laces_beg[j].push_back(codes[i]);
for (++i; match_lace(i, j); ++i) {
continue;
}
laces_end[j].push_back(codes[i]);
continue;
}
singles.push_back(codes[i]);
if (!cases.empty()) {
singles_cases.push_back(cases[i]);
}
}
auto build_list = [](
string_range name, std::size_t ncol,
code_vec const &col1, code_vec const &col2, code_vec const &col3
) {
if (col1.empty()) {
return;
}
ostd::writefln("%s:", name);
for (std::size_t i = 0; i < col1.size(); ++i) {
switch (ncol) {
case 1:
ostd::writefln(" 0x%06X", col1[i]);
break;
case 2:
ostd::writefln(" 0x%06X, 0x%06X", col1[i], col2[i]);
break;
case 3:
ostd::writefln(
" 0x%06X, 0x%06X, 0x%06X",
col1[i], col2[i], col3[i]
);
break;
default:
throw std::runtime_error{"invalid column number"};
break;
}
}
};
build_list(
"ranges", !cases.empty() + 2, ranges_beg, ranges_end, ranges_cases
);
build_list("laces1", 2, laces_beg[0], laces_end[0], laces_beg[0]);
build_list("laces2", 2, laces_beg[1], laces_end[1], laces_beg[1]);
build_list(
"singles", !cases.empty() + 1, singles, singles_cases, singles
);
}
};
int main(int argc, char **argv) {
@ -104,7 +215,7 @@ int main(int argc, char **argv) {
return 1;
}
ostd::string_range fname = argv[1];
string_range fname = argv[1];
ostd::file_stream f{fname};
if (!f.is_open()) {
ostd::writefln("cannot open file '%s'", fname);
@ -113,7 +224,27 @@ int main(int argc, char **argv) {
parse_state ps;
for (auto const &line: f.iter_lines()) {
ps.parse_line(line);
try {
for (auto const &line: f.iter_lines()) {
ps.parse_line(line);
}
} catch (std::runtime_error const &e) {
ostd::writeln(e.what());
return 1;
}
ostd::writeln("ALPHAS:");
ps.build(ps.alphas);
ostd::writeln("CONTROL:");
ps.build(ps.controls);
ostd::writeln("DIGITS:");
ps.build(ps.digits);
ostd::writeln("LOWERCASE:");
ps.build(ps.lowers, ps.touppers);
ostd::writeln("SPACES:");
ps.build(ps.spaces);
ostd::writeln("TITLES:");
ps.build(ps.titles);
ostd::writeln("UPPERCASE:");
ps.build(ps.uppers, ps.tolowers);
}