implement all the unicode ctype funcs, generate the tables

master
Daniel Kolesa 2018-01-02 22:23:03 +01:00
parent 2b291bca39
commit a0337c401e
5 changed files with 380 additions and 45 deletions

1
.gitignore vendored
View File

@ -16,3 +16,4 @@ examples/stream1
examples/stream2
test_runner
doc/output
src/string_utf.hh

View File

@ -29,6 +29,9 @@ namespace fs = ostd::filesystem;
#include "src/process.cc"
#include "src/filesystem.cc"
#define OSTD_GEN_UNICODE_INCLUDE
#include "gen_unicode.cc"
using strvec = std::vector<std::string>;
using pathvec = std::vector<fs::path>;
@ -50,6 +53,9 @@ static pathvec TEST_CASES = {
"algorithm", "range"
};
static fs::path OSTD_UNICODE_DATA = "data/UnicodeData-10.0.txt";
static fs::path OSTD_UNICODE_SRC = CXX_SOURCE_DIR / "string_utf.hh";
static fs::path OSTD_SHARED_LIB = "libostd.so";
static fs::path OSTD_STATIC_LIB = "libostd.a";
@ -261,6 +267,7 @@ int main(int argc, char **argv) {
rp.replace_filename(cso.string() + "_dyn.o");
try_remove(rp);
}
try_remove(OSTD_UNICODE_SRC);
try_remove(OSTD_STATIC_LIB);
try_remove(OSTD_SHARED_LIB);
try_remove("test_runner.o");
@ -495,6 +502,11 @@ int main(int argc, char **argv) {
}
};
echo_q("Generating Unicode tables...");
ostd::unicode_gen::parse_state{}.build_all_from_file(
OSTD_UNICODE_DATA.string(), OSTD_UNICODE_SRC.string()
);
echo_q("Building the library...");
build_all(ASM_SOURCES, ASM_SOURCE_DIR, ".S", call_as);
build_all(cxx_sources.get(), fs::path{}, ".cc", call_cxx);

View File

@ -9,20 +9,23 @@
#include <cstdint>
#include <cctype>
#include <ctime>
#include <vector>
#include <array>
#include <stdexcept>
#include <initializer_list>
#include <ostd/io.hh>
#include <ostd/string.hh>
#include <ostd/algorithm.hh>
using ostd::string_range;
namespace ostd {
namespace unicode_gen {
using code_t = std::uint32_t;
using code_vec = std::vector<code_t>;
code_t hex_to_code(string_range hs) {
inline code_t hex_to_code(string_range hs) {
code_t ret = 0;
for (char c: hs) {
if (!std::isxdigit(c |= 32)) {
@ -65,7 +68,6 @@ struct parse_state {
}
assert_line(!bits[0].empty() && (bits[2].size() == 2));
code_t n = hex_to_code(bits[0]);
/* control chars */
if (bits[2] == "Cc") {
controls.push_back(n);
@ -109,7 +111,11 @@ struct parse_state {
/* good enough for now, ignore the rest */
}
void build(code_vec const &codes, code_vec const &cases = code_vec{}) {
template<typename R>
void build(
R &writer, string_range name,
code_vec const &codes, code_vec const &cases = code_vec{}
) {
code_vec singles;
code_vec singles_cases;
code_vec ranges_beg;
@ -134,14 +140,18 @@ struct parse_state {
};
auto match_lace = [
&codes, &cases, &match_pair
](std::size_t i, int off) {
](std::size_t i, std::size_t offs) {
int off = (!int(offs) * 2) - 1;
return match_pair(i, 2) && (cases.empty() || (
(cases[i + 1] == (codes[i + 1] + off)) &&
(cases[i ] == (codes[i ] + off))
));
};
for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) {
bool endseq = false;
std::size_t i = 0;
std::size_t ncodes = codes.size();
while (i < ncodes) {
if (match_range(i)) {
ranges_beg.push_back(codes[i]);
if (!cases.empty()) {
@ -153,42 +163,54 @@ struct parse_state {
}
/* end of range, try others */
ranges_end.push_back(codes[i]);
endseq = true;
continue;
}
if (size_t j = 0; match_lace(i, 1) || match_lace(i, -1)) {
if (size_t j = 0; match_lace(i, j) || match_lace(i, ++j)) {
laces_beg[j].push_back(codes[i]);
for (++i; match_lace(i, j); ++i) {
continue;
}
laces_end[j].push_back(codes[i]);
endseq = true;
continue;
}
if (!endseq) {
singles.push_back(codes[i]);
if (!cases.empty()) {
singles_cases.push_back(cases[i]);
}
}
endseq = false;
++i;
}
auto build_list = [](
string_range name, std::size_t ncol,
auto build_list = [&writer, &name](
string_range aname, std::size_t ncol,
code_vec const &col1, code_vec const &col2, code_vec const &col3
) {
if (col1.empty()) {
return;
}
ostd::writefln("%s:", name);
for (std::size_t i = 0; i < col1.size(); ++i) {
format(
writer, "static char32_t %s_%s[][%d] = {\n",
name, aname, ncol
);
for (std::size_t j = 0; j < col1.size(); ++j) {
switch (ncol) {
case 1:
ostd::writefln(" 0x%06X", col1[i]);
format(writer, " { 0x%06X },\n", col1[j]);
break;
case 2:
ostd::writefln(" 0x%06X, 0x%06X", col1[i], col2[i]);
format(
writer, " { 0x%06X, 0x%06X },\n",
col1[j], col2[j]
);
break;
case 3:
ostd::writefln(
" 0x%06X, 0x%06X, 0x%06X",
col1[i], col2[i], col3[i]
format(
writer, " { 0x%06X, 0x%06X, 0x%06X },\n",
col1[j], col2[j], col3[j]
);
break;
default:
@ -196,8 +218,15 @@ struct parse_state {
break;
}
}
format(writer, "};\n\n");
};
if (cases.empty()) {
format(writer, "\n/* is%s */\n\n", name);
} else {
format(writer, "\n/* is%s, to%s */\n\n", name, name);
}
build_list(
"ranges", !cases.empty() + 2, ranges_beg, ranges_end, ranges_cases
);
@ -206,45 +235,132 @@ struct parse_state {
build_list(
"singles", !cases.empty() + 1, singles, singles_cases, singles
);
/* is_CTYPE(c) */
build_func(
writer, name, "is", "bool",
ranges_beg, laces_beg[0], laces_beg[1], singles
);
/* to_CTYPE(c) */
if (!cases.empty()) {
build_func(
writer, name, "to", "char32_t",
ranges_beg, laces_beg[0], laces_beg[1], singles
);
}
}
template<typename R>
void build_header(R &writer) {
char buf[64];
time_t curt;
std::time(&curt);
strftime(buf, sizeof(buf), "%c", std::localtime(&curt));
format(
writer, "/* Generated on %s by gen_unicode (libostd) */\n",
static_cast<char *>(buf)
);
}
template<typename R>
void build_func(
R &writer,
string_range name,
string_range prefix,
string_range ret_type,
code_vec const &ranges,
code_vec const &laces1,
code_vec const &laces2,
code_vec const &singles
) {
format(
writer, "%s %s%s(char32_t c) {\n", ret_type, prefix, name
);
format(writer, " return utf::uctype_func<\n");
auto it1 = { &ranges, &laces1, &laces2, &singles };
auto it2 = { "ranges", "laces1", "laces2", "singles" };
for (std::size_t i = 0; i < it1.size(); ++i) {
if (it1.begin()[i]->empty()) {
format(writer, " 0, 0");
} else {
format(
writer, " sizeof(%s_%s), sizeof(*%s_%s)",
name, it2.begin()[i], name, it2.begin()[i]
);
}
if ((i + 1) != it1.size()) {
format(writer, ",\n");
} else {
format(writer, "\n");
}
}
format(writer, " >::do_%s(\n c, ", prefix);
for (std::size_t i = 0; i < it1.size(); ++i) {
if (i != 0) {
format(writer, ", ");
}
if (it1.begin()[i]->empty()) {
format(writer, "nullptr");
} else {
format(writer, "%s_%s", name, it2.begin()[i]);
}
}
format(writer, "\n );\n}\n\n");
}
template<typename R, typename IR>
void build_all(R &writer, IR lines) {
for (auto const &line: lines) {
parse_line(line);
}
build_header(writer);
build(writer, "alpha", alphas);
build(writer, "cntrl", controls);
build(writer, "digit", digits);
build(writer, "lower", lowers, touppers);
build(writer, "space", spaces);
build(writer, "title", titles);
build(writer, "upper", uppers, tolowers);
}
void build_all_from_file(string_range input, string_range output) {
file_stream ifs{input};
if (!ifs.is_open()) {
throw std::runtime_error{"could not open input file"};
}
file_stream ofs{output, stream_mode::WRITE};
if (!ofs.is_open()) {
throw std::runtime_error{"could not open output file"};
}
auto writer = ofs.iter();
build_all(writer, ifs.iter_lines());
}
};
} /* namespace unicode_gen */
} /* namespace ostd */
#ifndef OSTD_GEN_UNICODE_INCLUDE
int main(int argc, char **argv) {
if (argc <= 1) {
ostd::writeln("not enough arguments");
return 1;
}
string_range fname = argv[1];
ostd::file_stream f{fname};
if (!f.is_open()) {
ostd::writefln("cannot open file '%s'", fname);
return 1;
ostd::string_range fname = argv[1];
ostd::string_range outname = "src/string_utf.hh";
if (argc >= 3) {
outname = argv[2];
}
parse_state ps;
ostd::unicode_gen::parse_state ps;
try {
for (auto const &line: f.iter_lines()) {
ps.parse_line(line);
}
ps.build_all_from_file(fname, outname);
} catch (std::runtime_error const &e) {
ostd::writeln(e.what());
return 1;
}
ostd::writeln("ALPHAS:");
ps.build(ps.alphas);
ostd::writeln("CONTROL:");
ps.build(ps.controls);
ostd::writeln("DIGITS:");
ps.build(ps.digits);
ostd::writeln("LOWERCASE:");
ps.build(ps.lowers, ps.touppers);
ostd::writeln("SPACES:");
ps.build(ps.spaces);
ostd::writeln("TITLES:");
ps.build(ps.titles);
ostd::writeln("UPPERCASE:");
ps.build(ps.uppers, ps.tolowers);
}
#endif

View File

@ -888,6 +888,23 @@ namespace utf {
return detail::codepoint_range<char32_t>{r};
}
bool isalnum(char32_t c);
bool isalpha(char32_t c);
bool isblank(char32_t c);
bool iscntrl(char32_t c);
bool isdigit(char32_t c);
bool isgraph(char32_t c);
bool islower(char32_t c);
bool isprint(char32_t c);
bool ispunct(char32_t c);
bool isspace(char32_t c);
bool istitle(char32_t c);
bool isupper(char32_t c);
bool isvalid(char32_t c);
bool isxdigit(char32_t c);
char32_t tolower(char32_t c);
char32_t toupper(char32_t c);
/** @} */
} /* namespace utf */

View File

@ -4,6 +4,7 @@
*/
#include <cstdint>
#include <cstdlib>
#include "ostd/string.hh"
namespace ostd {
@ -114,5 +115,193 @@ std::size_t length(string_range r, string_range &cont) noexcept {
return ret;
}
/* unicode-aware ctype
* the other ones use custom tables for lookups
*/
bool isalnum(char32_t c) {
return (utf::isalpha(c) || utf::isdigit(c));
}
bool isblank(char32_t c) {
return ((c == ' ') || (c == '\t'));
}
bool isgraph(char32_t c) {
return (!utf::isspace(c) && utf::isprint(c));
}
bool isprint(char32_t c) {
switch (std::uint32_t(c)) {
case 0x2028:
case 0x2029:
case 0xFFF9:
case 0xFFFA:
case 0xFFFB:
return false;
default:
return !utf::iscntrl(c);
}
}
bool ispunct(char32_t c) {
return (utf::isgraph(c) && !utf::isalnum(c));
}
bool isvalid(char32_t c) {
/* surrogate code points */
if ((c >= 0xD800) && (c <= 0xDFFF)) {
return false;
}
/* non-characters */
if ((c >= 0xFDD0) && (c <= 0xFDEF)) {
return false;
}
/* end of plane */
if ((c & 0xFFFE) == 0xFFFE) {
return false;
}
/* must be within range */
return (c <= MaxCodepoint);
}
bool isxdigit(char32_t c) {
if ((c >= '0') && (c <= '9')) {
return true;
}
auto uc = std::uint32_t(c) | 32;
return ((uc >= 'a') && (uc <= 'f'));
}
inline int codepoint_cmp1(void const *a, void const *b) {
char32_t c1 = *static_cast<char32_t const *>(a);
char32_t c2 = *static_cast<char32_t const *>(b);
return (c1 - c2);
}
inline int codepoint_cmp2(void const *a, void const *b) {
char32_t c = *static_cast<char32_t const *>(a);
char32_t const *p = static_cast<char32_t const *>(b);
if ((c >= p[0]) && (c <= p[1])) {
return 0;
}
return (c - p[0]);
}
template<
std::size_t RangesN, std::size_t RangesS,
std::size_t Laces1N, std::size_t Laces1S,
std::size_t Laces2N, std::size_t Laces2S,
std::size_t SinglesN, std::size_t SinglesS
>
struct uctype_func {
static bool do_is(
char32_t c,
void const *ranges [[maybe_unused]],
void const *laces1 [[maybe_unused]],
void const *laces2 [[maybe_unused]],
void const *singles [[maybe_unused]]
) {
if constexpr(RangesN != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, ranges, RangesN / RangesS, RangesS / sizeof(char32_t),
codepoint_cmp2
));
if (found) {
return true;
}
}
if constexpr(Laces1N != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, laces1, Laces1N / Laces1S, Laces1S / sizeof(char32_t),
codepoint_cmp2
));
if (found) {
return !((c - found[0]) % 2);
}
}
if constexpr(Laces2N != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, laces2, Laces2N / Laces2S, Laces2S / sizeof(char32_t),
codepoint_cmp2
));
if (found) {
return !((c - found[0]) % 2);
}
}
if constexpr(SinglesN != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, singles, SinglesN / SinglesS, SinglesS / sizeof(char32_t),
codepoint_cmp1
));
if (found) {
return true;
}
}
return false;
}
static char32_t do_to(
char32_t c,
void const *ranges [[maybe_unused]],
void const *laces1 [[maybe_unused]],
void const *laces2 [[maybe_unused]],
void const *singles [[maybe_unused]]
) {
if constexpr(RangesN != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, ranges, RangesN >> 4, RangesN & 0xF, codepoint_cmp2
));
if (found) {
return (found[2] + (c - found[0]));
}
}
if constexpr(Laces1N != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, laces1, Laces1N >> 4, Laces1N & 0xF, codepoint_cmp2
));
if (found) {
if ((c - found[0]) % 2) {
return c;
}
return c + 1;
}
}
if constexpr(Laces2N != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, laces2, Laces2N >> 4, Laces2N & 0xF, codepoint_cmp2
));
if (found) {
if ((c - found[0]) % 2) {
return c;
}
return c - 1;
}
}
if constexpr(SinglesN != 0) {
char32_t *found = static_cast<char32_t *>(std::bsearch(
&c, singles, SinglesN >> 4, SinglesN & 0xF, codepoint_cmp1
));
if (found) {
return found[1];
}
}
return c;
}
};
/* these are geneated */
bool isalpha(char32_t c);
bool iscntrl(char32_t c);
bool isdigit(char32_t c);
bool islower(char32_t c);
bool isspace(char32_t c);
bool istitle(char32_t c);
bool isupper(char32_t c);
char32_t tolower(char32_t c);
char32_t toupper(char32_t c);
#include "string_utf.hh"
} /* namespace utf */
} /* namespace ostd */