forked from OctaForge/libostd
implement all the unicode ctype funcs, generate the tables
This commit is contained in:
parent
2b291bca39
commit
a0337c401e
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -16,3 +16,4 @@ examples/stream1
|
|||
examples/stream2
|
||||
test_runner
|
||||
doc/output
|
||||
src/string_utf.hh
|
||||
|
|
12
build.cc
12
build.cc
|
@ -29,6 +29,9 @@ namespace fs = ostd::filesystem;
|
|||
#include "src/process.cc"
|
||||
#include "src/filesystem.cc"
|
||||
|
||||
#define OSTD_GEN_UNICODE_INCLUDE
|
||||
#include "gen_unicode.cc"
|
||||
|
||||
using strvec = std::vector<std::string>;
|
||||
using pathvec = std::vector<fs::path>;
|
||||
|
||||
|
@ -50,6 +53,9 @@ static pathvec TEST_CASES = {
|
|||
"algorithm", "range"
|
||||
};
|
||||
|
||||
static fs::path OSTD_UNICODE_DATA = "data/UnicodeData-10.0.txt";
|
||||
static fs::path OSTD_UNICODE_SRC = CXX_SOURCE_DIR / "string_utf.hh";
|
||||
|
||||
static fs::path OSTD_SHARED_LIB = "libostd.so";
|
||||
static fs::path OSTD_STATIC_LIB = "libostd.a";
|
||||
|
||||
|
@ -261,6 +267,7 @@ int main(int argc, char **argv) {
|
|||
rp.replace_filename(cso.string() + "_dyn.o");
|
||||
try_remove(rp);
|
||||
}
|
||||
try_remove(OSTD_UNICODE_SRC);
|
||||
try_remove(OSTD_STATIC_LIB);
|
||||
try_remove(OSTD_SHARED_LIB);
|
||||
try_remove("test_runner.o");
|
||||
|
@ -495,6 +502,11 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
};
|
||||
|
||||
echo_q("Generating Unicode tables...");
|
||||
ostd::unicode_gen::parse_state{}.build_all_from_file(
|
||||
OSTD_UNICODE_DATA.string(), OSTD_UNICODE_SRC.string()
|
||||
);
|
||||
|
||||
echo_q("Building the library...");
|
||||
build_all(ASM_SOURCES, ASM_SOURCE_DIR, ".S", call_as);
|
||||
build_all(cxx_sources.get(), fs::path{}, ".cc", call_cxx);
|
||||
|
|
206
gen_unicode.cc
206
gen_unicode.cc
|
@ -9,20 +9,23 @@
|
|||
|
||||
#include <cstdint>
|
||||
#include <cctype>
|
||||
#include <ctime>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <stdexcept>
|
||||
#include <initializer_list>
|
||||
|
||||
#include <ostd/io.hh>
|
||||
#include <ostd/string.hh>
|
||||
#include <ostd/algorithm.hh>
|
||||
|
||||
using ostd::string_range;
|
||||
namespace ostd {
|
||||
namespace unicode_gen {
|
||||
|
||||
using code_t = std::uint32_t;
|
||||
using code_vec = std::vector<code_t>;
|
||||
|
||||
code_t hex_to_code(string_range hs) {
|
||||
inline code_t hex_to_code(string_range hs) {
|
||||
code_t ret = 0;
|
||||
for (char c: hs) {
|
||||
if (!std::isxdigit(c |= 32)) {
|
||||
|
@ -65,7 +68,6 @@ struct parse_state {
|
|||
}
|
||||
assert_line(!bits[0].empty() && (bits[2].size() == 2));
|
||||
code_t n = hex_to_code(bits[0]);
|
||||
|
||||
/* control chars */
|
||||
if (bits[2] == "Cc") {
|
||||
controls.push_back(n);
|
||||
|
@ -109,7 +111,11 @@ struct parse_state {
|
|||
/* good enough for now, ignore the rest */
|
||||
}
|
||||
|
||||
void build(code_vec const &codes, code_vec const &cases = code_vec{}) {
|
||||
template<typename R>
|
||||
void build(
|
||||
R &writer, string_range name,
|
||||
code_vec const &codes, code_vec const &cases = code_vec{}
|
||||
) {
|
||||
code_vec singles;
|
||||
code_vec singles_cases;
|
||||
code_vec ranges_beg;
|
||||
|
@ -134,14 +140,18 @@ struct parse_state {
|
|||
};
|
||||
auto match_lace = [
|
||||
&codes, &cases, &match_pair
|
||||
](std::size_t i, int off) {
|
||||
](std::size_t i, std::size_t offs) {
|
||||
int off = (!int(offs) * 2) - 1;
|
||||
return match_pair(i, 2) && (cases.empty() || (
|
||||
(cases[i + 1] == (codes[i + 1] + off)) &&
|
||||
(cases[i ] == (codes[i ] + off))
|
||||
));
|
||||
};
|
||||
|
||||
for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) {
|
||||
bool endseq = false;
|
||||
std::size_t i = 0;
|
||||
std::size_t ncodes = codes.size();
|
||||
while (i < ncodes) {
|
||||
if (match_range(i)) {
|
||||
ranges_beg.push_back(codes[i]);
|
||||
if (!cases.empty()) {
|
||||
|
@ -153,42 +163,54 @@ struct parse_state {
|
|||
}
|
||||
/* end of range, try others */
|
||||
ranges_end.push_back(codes[i]);
|
||||
endseq = true;
|
||||
continue;
|
||||
}
|
||||
if (size_t j = 0; match_lace(i, 1) || match_lace(i, -1)) {
|
||||
if (size_t j = 0; match_lace(i, j) || match_lace(i, ++j)) {
|
||||
laces_beg[j].push_back(codes[i]);
|
||||
for (++i; match_lace(i, j); ++i) {
|
||||
continue;
|
||||
}
|
||||
laces_end[j].push_back(codes[i]);
|
||||
endseq = true;
|
||||
continue;
|
||||
}
|
||||
singles.push_back(codes[i]);
|
||||
if (!cases.empty()) {
|
||||
singles_cases.push_back(cases[i]);
|
||||
if (!endseq) {
|
||||
singles.push_back(codes[i]);
|
||||
if (!cases.empty()) {
|
||||
singles_cases.push_back(cases[i]);
|
||||
}
|
||||
}
|
||||
endseq = false;
|
||||
++i;
|
||||
}
|
||||
|
||||
auto build_list = [](
|
||||
string_range name, std::size_t ncol,
|
||||
auto build_list = [&writer, &name](
|
||||
string_range aname, std::size_t ncol,
|
||||
code_vec const &col1, code_vec const &col2, code_vec const &col3
|
||||
) {
|
||||
if (col1.empty()) {
|
||||
return;
|
||||
}
|
||||
ostd::writefln("%s:", name);
|
||||
for (std::size_t i = 0; i < col1.size(); ++i) {
|
||||
format(
|
||||
writer, "static char32_t %s_%s[][%d] = {\n",
|
||||
name, aname, ncol
|
||||
);
|
||||
for (std::size_t j = 0; j < col1.size(); ++j) {
|
||||
switch (ncol) {
|
||||
case 1:
|
||||
ostd::writefln(" 0x%06X", col1[i]);
|
||||
format(writer, " { 0x%06X },\n", col1[j]);
|
||||
break;
|
||||
case 2:
|
||||
ostd::writefln(" 0x%06X, 0x%06X", col1[i], col2[i]);
|
||||
format(
|
||||
writer, " { 0x%06X, 0x%06X },\n",
|
||||
col1[j], col2[j]
|
||||
);
|
||||
break;
|
||||
case 3:
|
||||
ostd::writefln(
|
||||
" 0x%06X, 0x%06X, 0x%06X",
|
||||
col1[i], col2[i], col3[i]
|
||||
format(
|
||||
writer, " { 0x%06X, 0x%06X, 0x%06X },\n",
|
||||
col1[j], col2[j], col3[j]
|
||||
);
|
||||
break;
|
||||
default:
|
||||
|
@ -196,8 +218,15 @@ struct parse_state {
|
|||
break;
|
||||
}
|
||||
}
|
||||
format(writer, "};\n\n");
|
||||
};
|
||||
|
||||
if (cases.empty()) {
|
||||
format(writer, "\n/* is%s */\n\n", name);
|
||||
} else {
|
||||
format(writer, "\n/* is%s, to%s */\n\n", name, name);
|
||||
}
|
||||
|
||||
build_list(
|
||||
"ranges", !cases.empty() + 2, ranges_beg, ranges_end, ranges_cases
|
||||
);
|
||||
|
@ -206,45 +235,132 @@ struct parse_state {
|
|||
build_list(
|
||||
"singles", !cases.empty() + 1, singles, singles_cases, singles
|
||||
);
|
||||
|
||||
/* is_CTYPE(c) */
|
||||
build_func(
|
||||
writer, name, "is", "bool",
|
||||
ranges_beg, laces_beg[0], laces_beg[1], singles
|
||||
);
|
||||
|
||||
/* to_CTYPE(c) */
|
||||
if (!cases.empty()) {
|
||||
build_func(
|
||||
writer, name, "to", "char32_t",
|
||||
ranges_beg, laces_beg[0], laces_beg[1], singles
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/* Writes the banner comment that opens the generated file.
 *
 * The banner carries the local time of generation formatted with "%c".
 * If the local time cannot be obtained or formatted, a fixed placeholder
 * is printed instead, so that an indeterminate buffer is never read.
 *
 * writer: output sink handed to format().
 */
template<typename R>
void build_header(R &writer) {
    char buf[64];
    char fallback[] = "(unknown time)";
    char *stamp = buf;
    std::time_t const now = std::time(nullptr);
    /* localtime may return null; strftime returns 0 when the result does
     * not fit, in which case the buffer contents are unspecified
     */
    std::tm const *local = std::localtime(&now);
    if (!local || (std::strftime(buf, sizeof(buf), "%c", local) == 0)) {
        stamp = fallback;
    }
    format(
        writer, "/* Generated on %s by gen_unicode (libostd) */\n",
        static_cast<char *>(stamp)
    );
}
|
||||
|
||||
template<typename R>
|
||||
void build_func(
|
||||
R &writer,
|
||||
string_range name,
|
||||
string_range prefix,
|
||||
string_range ret_type,
|
||||
code_vec const &ranges,
|
||||
code_vec const &laces1,
|
||||
code_vec const &laces2,
|
||||
code_vec const &singles
|
||||
) {
|
||||
format(
|
||||
writer, "%s %s%s(char32_t c) {\n", ret_type, prefix, name
|
||||
);
|
||||
format(writer, " return utf::uctype_func<\n");
|
||||
auto it1 = { &ranges, &laces1, &laces2, &singles };
|
||||
auto it2 = { "ranges", "laces1", "laces2", "singles" };
|
||||
for (std::size_t i = 0; i < it1.size(); ++i) {
|
||||
if (it1.begin()[i]->empty()) {
|
||||
format(writer, " 0, 0");
|
||||
} else {
|
||||
format(
|
||||
writer, " sizeof(%s_%s), sizeof(*%s_%s)",
|
||||
name, it2.begin()[i], name, it2.begin()[i]
|
||||
);
|
||||
}
|
||||
if ((i + 1) != it1.size()) {
|
||||
format(writer, ",\n");
|
||||
} else {
|
||||
format(writer, "\n");
|
||||
}
|
||||
}
|
||||
format(writer, " >::do_%s(\n c, ", prefix);
|
||||
for (std::size_t i = 0; i < it1.size(); ++i) {
|
||||
if (i != 0) {
|
||||
format(writer, ", ");
|
||||
}
|
||||
if (it1.begin()[i]->empty()) {
|
||||
format(writer, "nullptr");
|
||||
} else {
|
||||
format(writer, "%s_%s", name, it2.begin()[i]);
|
||||
}
|
||||
}
|
||||
format(writer, "\n );\n}\n\n");
|
||||
}
|
||||
|
||||
template<typename R, typename IR>
|
||||
void build_all(R &writer, IR lines) {
|
||||
for (auto const &line: lines) {
|
||||
parse_line(line);
|
||||
}
|
||||
|
||||
build_header(writer);
|
||||
|
||||
build(writer, "alpha", alphas);
|
||||
build(writer, "cntrl", controls);
|
||||
build(writer, "digit", digits);
|
||||
build(writer, "lower", lowers, touppers);
|
||||
build(writer, "space", spaces);
|
||||
build(writer, "title", titles);
|
||||
build(writer, "upper", uppers, tolowers);
|
||||
}
|
||||
|
||||
/* Reads the Unicode data from the file named by input and writes the
 * generated source into the file named by output.
 *
 * Throws std::runtime_error when either file cannot be opened. The output
 * file is opened only after the input file has been opened successfully.
 */
void build_all_from_file(string_range input, string_range output) {
    auto require_open = [](file_stream &stream, char const *message) {
        if (!stream.is_open()) {
            throw std::runtime_error{message};
        }
    };

    file_stream source{input};
    require_open(source, "could not open input file");

    file_stream sink{output, stream_mode::WRITE};
    require_open(sink, "could not open output file");

    auto writer = sink.iter();
    build_all(writer, source.iter_lines());
}
|
||||
};
|
||||
|
||||
} /* namespace unicode_gen */
|
||||
} /* namespace ostd */
|
||||
|
||||
#ifndef OSTD_GEN_UNICODE_INCLUDE
|
||||
int main(int argc, char **argv) {
|
||||
if (argc <= 1) {
|
||||
ostd::writeln("not enough arguments");
|
||||
return 1;
|
||||
}
|
||||
|
||||
string_range fname = argv[1];
|
||||
ostd::file_stream f{fname};
|
||||
if (!f.is_open()) {
|
||||
ostd::writefln("cannot open file '%s'", fname);
|
||||
return 1;
|
||||
ostd::string_range fname = argv[1];
|
||||
ostd::string_range outname = "src/string_utf.hh";
|
||||
if (argc >= 3) {
|
||||
outname = argv[2];
|
||||
}
|
||||
|
||||
parse_state ps;
|
||||
|
||||
ostd::unicode_gen::parse_state ps;
|
||||
try {
|
||||
for (auto const &line: f.iter_lines()) {
|
||||
ps.parse_line(line);
|
||||
}
|
||||
ps.build_all_from_file(fname, outname);
|
||||
} catch (std::runtime_error const &e) {
|
||||
ostd::writeln(e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ostd::writeln("ALPHAS:");
|
||||
ps.build(ps.alphas);
|
||||
ostd::writeln("CONTROL:");
|
||||
ps.build(ps.controls);
|
||||
ostd::writeln("DIGITS:");
|
||||
ps.build(ps.digits);
|
||||
ostd::writeln("LOWERCASE:");
|
||||
ps.build(ps.lowers, ps.touppers);
|
||||
ostd::writeln("SPACES:");
|
||||
ps.build(ps.spaces);
|
||||
ostd::writeln("TITLES:");
|
||||
ps.build(ps.titles);
|
||||
ostd::writeln("UPPERCASE:");
|
||||
ps.build(ps.uppers, ps.tolowers);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -888,6 +888,23 @@ namespace utf {
|
|||
return detail::codepoint_range<char32_t>{r};
|
||||
}
|
||||
|
||||
bool isalnum(char32_t c);
|
||||
bool isalpha(char32_t c);
|
||||
bool isblank(char32_t c);
|
||||
bool iscntrl(char32_t c);
|
||||
bool isdigit(char32_t c);
|
||||
bool isgraph(char32_t c);
|
||||
bool islower(char32_t c);
|
||||
bool isprint(char32_t c);
|
||||
bool ispunct(char32_t c);
|
||||
bool isspace(char32_t c);
|
||||
bool istitle(char32_t c);
|
||||
bool isupper(char32_t c);
|
||||
bool isvalid(char32_t c);
|
||||
bool isxdigit(char32_t c);
|
||||
char32_t tolower(char32_t c);
|
||||
char32_t toupper(char32_t c);
|
||||
|
||||
/** @} */
|
||||
|
||||
} /* namespace utf */
|
||||
|
|
189
src/string.cc
189
src/string.cc
|
@ -4,6 +4,7 @@
|
|||
*/
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include "ostd/string.hh"
|
||||
|
||||
namespace ostd {
|
||||
|
@ -114,5 +115,193 @@ std::size_t length(string_range r, string_range &cont) noexcept {
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* unicode-aware ctype
|
||||
* the other ones use custom tables for lookups
|
||||
*/
|
||||
|
||||
bool isalnum(char32_t c) {
|
||||
return (utf::isalpha(c) || utf::isdigit(c));
|
||||
}
|
||||
|
||||
/* True only for the space and horizontal tab characters. */
bool isblank(char32_t c) {
    switch (c) {
        case U' ':
        case U'\t':
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
bool isgraph(char32_t c) {
|
||||
return (!utf::isspace(c) && utf::isprint(c));
|
||||
}
|
||||
|
||||
bool isprint(char32_t c) {
|
||||
switch (std::uint32_t(c)) {
|
||||
case 0x2028:
|
||||
case 0x2029:
|
||||
case 0xFFF9:
|
||||
case 0xFFFA:
|
||||
case 0xFFFB:
|
||||
return false;
|
||||
default:
|
||||
return !utf::iscntrl(c);
|
||||
}
|
||||
}
|
||||
|
||||
bool ispunct(char32_t c) {
|
||||
return (utf::isgraph(c) && !utf::isalnum(c));
|
||||
}
|
||||
|
||||
bool isvalid(char32_t c) {
|
||||
/* surrogate code points */
|
||||
if ((c >= 0xD800) && (c <= 0xDFFF)) {
|
||||
return false;
|
||||
}
|
||||
/* non-characters */
|
||||
if ((c >= 0xFDD0) && (c <= 0xFDEF)) {
|
||||
return false;
|
||||
}
|
||||
/* end of plane */
|
||||
if ((c & 0xFFFE) == 0xFFFE) {
|
||||
return false;
|
||||
}
|
||||
/* must be within range */
|
||||
return (c <= MaxCodepoint);
|
||||
}
|
||||
|
||||
/* True for the ASCII hexadecimal digits 0-9, a-f and A-F. */
bool isxdigit(char32_t c) {
    bool const decimal = (c >= U'0') && (c <= U'9');
    if (decimal) {
        return true;
    }
    /* setting bit 5 maps 'A'..'F' onto 'a'..'f' */
    std::uint32_t const folded = static_cast<std::uint32_t>(c) | 32;
    return (folded >= U'a') && (folded <= U'f');
}
|
||||
|
||||
/* std::bsearch comparator for rows whose first element is the key.
 *
 * a points to the searched code point, b to the first element of a row.
 * Explicit comparisons are used instead of a subtraction: the difference
 * of two unsigned code points narrowed to int misorders values that are
 * 2^31 or more apart.
 */
inline int codepoint_cmp1(void const *a, void const *b) {
    char32_t const c1 = *static_cast<char32_t const *>(a);
    char32_t const c2 = *static_cast<char32_t const *>(b);
    if (c1 < c2) {
        return -1;
    }
    return (c1 > c2) ? 1 : 0;
}

/* std::bsearch comparator for rows that begin with an inclusive
 * [first, last] pair.
 *
 * a points to the searched code point, b to the first element of a row.
 * Returns 0 when the code point lies inside the pair, a negative value
 * when it is below it and a positive value when it is above it.
 */
inline int codepoint_cmp2(void const *a, void const *b) {
    char32_t const c = *static_cast<char32_t const *>(a);
    char32_t const *p = static_cast<char32_t const *>(b);
    if (c < p[0]) {
        return -1;
    }
    return (c > p[1]) ? 1 : 0;
}

/* Table driven implementation shared by the generated is / to functions.
 *
 * Each table is described by two template arguments: XxxN is the size of
 * the whole table in bytes and XxxS is the size of one row in bytes (the
 * generator passes sizeof(table), sizeof(*table)). A pair of zeros means
 * the table is absent, and its pointer argument is then never used.
 */
template<
    std::size_t RangesN, std::size_t RangesS,
    std::size_t Laces1N, std::size_t Laces1S,
    std::size_t Laces2N, std::size_t Laces2S,
    std::size_t SinglesN, std::size_t SinglesS
>
struct uctype_func {
private:
    /* Binary-searches a table of N bytes made of rows of S bytes each and
     * returns the matching row, or nullptr when there is none. bsearch
     * takes the row count and the row size in bytes, in that order.
     */
    template<std::size_t N, std::size_t S>
    static char32_t const *find_row(
        char32_t c, void const *table,
        int (*cmp)(void const *, void const *)
    ) {
        static_assert(
            (S != 0) && ((N % S) == 0),
            "table size must be a whole number of rows"
        );
        return static_cast<char32_t const *>(
            std::bsearch(&c, table, N / S, S, cmp)
        );
    }

public:
    /* Membership test: true when c belongs to any of the tables.
     *
     * Inside a lace row only the code points at an even distance from the
     * row start are members.
     */
    static bool do_is(
        char32_t c,
        void const *ranges [[maybe_unused]],
        void const *laces1 [[maybe_unused]],
        void const *laces2 [[maybe_unused]],
        void const *singles [[maybe_unused]]
    ) {
        if constexpr(RangesN != 0) {
            if (find_row<RangesN, RangesS>(c, ranges, codepoint_cmp2)) {
                return true;
            }
        }
        if constexpr(Laces1N != 0) {
            char32_t const *found = find_row<Laces1N, Laces1S>(
                c, laces1, codepoint_cmp2
            );
            if (found) {
                return !((c - found[0]) % 2);
            }
        }
        if constexpr(Laces2N != 0) {
            char32_t const *found = find_row<Laces2N, Laces2S>(
                c, laces2, codepoint_cmp2
            );
            if (found) {
                return !((c - found[0]) % 2);
            }
        }
        if constexpr(SinglesN != 0) {
            if (find_row<SinglesN, SinglesS>(c, singles, codepoint_cmp1)) {
                return true;
            }
        }
        return false;
    }

    /* Case conversion: returns the mapped code point, or c when no table
     * contains it.
     *
     * Range rows must hold three elements (first, last, mapping of first)
     * and single rows two (code point, mapping). In the first lace table
     * members map to c + 1, in the second to c - 1; code points at an odd
     * distance from the row start are returned unchanged.
     */
    static char32_t do_to(
        char32_t c,
        void const *ranges [[maybe_unused]],
        void const *laces1 [[maybe_unused]],
        void const *laces2 [[maybe_unused]],
        void const *singles [[maybe_unused]]
    ) {
        if constexpr(RangesN != 0) {
            char32_t const *found = find_row<RangesN, RangesS>(
                c, ranges, codepoint_cmp2
            );
            if (found) {
                return (found[2] + (c - found[0]));
            }
        }
        if constexpr(Laces1N != 0) {
            char32_t const *found = find_row<Laces1N, Laces1S>(
                c, laces1, codepoint_cmp2
            );
            if (found) {
                if ((c - found[0]) % 2) {
                    return c;
                }
                return c + 1;
            }
        }
        if constexpr(Laces2N != 0) {
            char32_t const *found = find_row<Laces2N, Laces2S>(
                c, laces2, codepoint_cmp2
            );
            if (found) {
                if ((c - found[0]) % 2) {
                    return c;
                }
                return c - 1;
            }
        }
        if constexpr(SinglesN != 0) {
            char32_t const *found = find_row<SinglesN, SinglesS>(
                c, singles, codepoint_cmp1
            );
            if (found) {
                return found[1];
            }
        }
        return c;
    }
};
|
||||
|
||||
/* these are generated */
|
||||
bool isalpha(char32_t c);
|
||||
bool iscntrl(char32_t c);
|
||||
bool isdigit(char32_t c);
|
||||
bool islower(char32_t c);
|
||||
bool isspace(char32_t c);
|
||||
bool istitle(char32_t c);
|
||||
bool isupper(char32_t c);
|
||||
char32_t tolower(char32_t c);
|
||||
char32_t toupper(char32_t c);
|
||||
|
||||
#include "string_utf.hh"
|
||||
|
||||
} /* namespace utf */
|
||||
} /* namespace ostd */
|
||||
|
|
Loading…
Reference in a new issue