From 2b291bca3912209299507d15bc48387a7eb921d5 Mon Sep 17 00:00:00 2001
From: q66 <daniel@octaforge.org>
Date: Tue, 2 Jan 2018 19:03:39 +0100
Subject: [PATCH] implement the sorting logic in unicode generator

---
 COPYING.md     |   6 +++
 build.cc       |   7 ++-
 gen_unicode.cc | 143 ++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 149 insertions(+), 7 deletions(-)
diff --git a/COPYING.md b/COPYING.md
index 6242ab8..1bad216 100644
--- a/COPYING.md
+++ b/COPYING.md
@@ -32,6 +32,12 @@ SOFTWARE.**
 Additionally some code from the libc++ project has been used as a reference;
 libc++ is a part of the LLVM project.
 
+The libutf project (https://github.com/cls/libutf) was used as an inspiration
+for some of the Unicode implementation bits within libostd and the gen\_unicode
+program used to generate the necessary ctype tables, but all of the code was
+written from scratch; nevertheless, I would like to give some credit and thanks
+to the project here.
+
 Additional thanks to Dale Weiler aka graphitemaster (reference code in the
 Neothyne project) and cppreference.com.
 
diff --git a/build.cc b/build.cc
index bdb43f8..13952cd 100644
--- a/build.cc
+++ b/build.cc
@@ -1,4 +1,9 @@
-/* A build system for libostd. */
+/* A simple build system to build libostd.
+ *
+ * This file is a part of the libostd project. Libostd is licensed under the
+ * University of Illinois/NCSA Open Source License, as is this file. See the
+ * COPYING.md file for further information.
+ */
 
 /* for Windows so that we avoid dllimport/dllexport */
 #define OSTD_BUILD_LIB
diff --git a/gen_unicode.cc b/gen_unicode.cc
index 4e1ff68..998b1f7 100644
--- a/gen_unicode.cc
+++ b/gen_unicode.cc
@@ -1,16 +1,28 @@
+/* This simple program generates the tables necessary for Unicode character
+ * types. It's inspired by the mkrunetype.awk generator from the libutf
+ * project, see COPYING.md.
+ *
+ * This file is a part of the libostd project. Libostd is licensed under the
+ * University of Illinois/NCSA Open Source License, as is this file. See the
+ * COPYING.md file for further information.
+ */
+
 #include <cstdint>
 #include <cctype>
 #include <vector>
+#include <array>
 #include <stdexcept>
 
 #include <ostd/io.hh>
 #include <ostd/string.hh>
 #include <ostd/algorithm.hh>
 
+using ostd::string_range;
+
 using code_t = std::uint32_t;
 using code_vec = std::vector<code_t>;
 
-code_t hex_to_code(ostd::string_range hs) {
+code_t hex_to_code(string_range hs) {
     code_t ret = 0;
     for (char c: hs) {
         if (!std::isxdigit(c |= 32)) {
@@ -38,8 +50,8 @@ struct parse_state {
         }
     }
 
-    void parse_line(ostd::string_range line) {
-        std::array<ostd::string_range, 15> bits;
+    void parse_line(string_range line) {
+        std::array<string_range, 15> bits;
         for (std::size_t n = 0;;) {
             auto sc = ostd::find(line, ';');
             if (!sc) {
@@ -96,6 +108,105 @@ struct parse_state {
         }
         /* good enough for now, ignore the rest */
     }
+
+    /* Splits the code list into ranges (stride 1), laces (stride 2; table 0
+     * maps case by +1, table 1 by -1) and singles, then prints each table.
+     * Non-empty `cases` must match `codes` in size or runtime_error is thrown.
+     */
+    void build(code_vec const &codes, code_vec const &cases = code_vec{}) {
+        code_vec singles;
+        code_vec singles_cases;
+        code_vec ranges_beg;
+        code_vec ranges_end;
+        code_vec ranges_cases;
+        code_vec laces_beg[2];
+        code_vec laces_end[2];
+
+        if (!cases.empty() && (cases.size() != codes.size())) {
+            throw std::runtime_error{"mismatched code lists"};
+        }
+
+        auto match_pair = [&codes](std::size_t i, std::size_t off) {
+            return (i + 1 < codes.size()) && (codes[i + 1] == codes[i] + off);
+        };
+        auto match_range = [&cases, &match_pair](std::size_t i) {
+            return match_pair(i, 1) && (
+                cases.empty() || (cases[i + 1] == (cases[i] + 1))
+            );
+        };
+        auto match_lace = [
+            &codes, &cases, &match_pair
+        ](std::size_t i, int off) {
+            return match_pair(i, 2) && (cases.empty() || (
+                (cases[i + 1] == (codes[i + 1] + off)) &&
+                (cases[i    ] == (codes[i    ] + off))
+            ));
+        };
+
+        for (std::size_t i = 0, ncodes = codes.size(); i < ncodes; ++i) {
+            if (match_range(i)) {
+                ranges_beg.push_back(codes[i]);
+                if (!cases.empty()) {
+                    ranges_cases.push_back(cases[i]);
+                }
+                /* go to the end of sequence */
+                while (match_range(++i)) {}
+                /* end of range, try others */
+                ranges_end.push_back(codes[i]);
+                continue;
+            }
+            /* fix the case offset once so the whole lace uses the same one */
+            if (int off = match_lace(i, 1) ? 1 : -1; match_lace(i, off)) {
+                std::size_t j = (off < 0) ? 1 : 0;
+                laces_beg[j].push_back(codes[i]);
+                while (match_lace(++i, off)) {}
+                laces_end[j].push_back(codes[i]);
+                continue;
+            }
+            singles.push_back(codes[i]);
+            if (!cases.empty()) {
+                singles_cases.push_back(cases[i]);
+            }
+        }
+
+        /* print one named table of ncol columns; empty tables are skipped */
+        auto build_list = [](
+            string_range name, std::size_t ncol,
+            code_vec const &col1, code_vec const &col2, code_vec const &col3
+        ) {
+            if (col1.empty()) {
+                return;
+            }
+            ostd::writefln("%s:", name);
+            for (std::size_t i = 0; i < col1.size(); ++i) {
+                switch (ncol) {
+                    case 1:
+                        ostd::writefln("  0x%06X", col1[i]);
+                        break;
+                    case 2:
+                        ostd::writefln("  0x%06X, 0x%06X", col1[i], col2[i]);
+                        break;
+                    case 3:
+                        ostd::writefln(
+                            "  0x%06X, 0x%06X, 0x%06X",
+                            col1[i], col2[i], col3[i]
+                        );
+                        break;
+                    default:
+                        throw std::runtime_error{"invalid column number"};
+                }
+            }
+        };
+
+        build_list(
+            "ranges", !cases.empty() + 2, ranges_beg, ranges_end, ranges_cases
+        );
+        build_list("laces1", 2, laces_beg[0], laces_end[0], laces_beg[0]);
+        build_list("laces2", 2, laces_beg[1], laces_end[1], laces_beg[1]);
+        build_list(
+            "singles", !cases.empty() + 1, singles, singles_cases, singles
+        );
+    }
 };
 
 int main(int argc, char **argv) {
@@ -104,7 +215,7 @@ int main(int argc, char **argv) {
         return 1;
     }
 
-    ostd::string_range fname = argv[1];
+    string_range fname = argv[1];
     ostd::file_stream f{fname};
     if (!f.is_open()) {
         ostd::writefln("cannot open file '%s'", fname);
@@ -113,7 +224,27 @@ int main(int argc, char **argv) {
 
     parse_state ps;
 
-    for (auto const &line: f.iter_lines()) {
-        ps.parse_line(line);
+    try {
+        for (auto const &line: f.iter_lines()) {
+            ps.parse_line(line);
+        }
+    } catch (std::runtime_error const &e) {
+        ostd::writeln(e.what());
+        return 1;
     }
+
+    ostd::writeln("ALPHAS:");
+    ps.build(ps.alphas);
+    ostd::writeln("CONTROL:");
+    ps.build(ps.controls);
+    ostd::writeln("DIGITS:");
+    ps.build(ps.digits);
+    ostd::writeln("LOWERCASE:");
+    ps.build(ps.lowers, ps.touppers);
+    ostd::writeln("SPACES:");
+    ps.build(ps.spaces);
+    ostd::writeln("TITLES:");
+    ps.build(ps.titles);
+    ostd::writeln("UPPERCASE:");
+    ps.build(ps.uppers, ps.tolowers);
 }