various traits and constants for unicode types

2018-01-07 17:13:53 +01:00 · 2018-01-07 17:13:53 +01:00 · 24d1b5ec25
parent be803bac7b
commit 24d1b5ec25
2 changed files with 96 additions and 47 deletions
--- a/ostd/string.hh
+++ b/ostd/string.hh
@@ -629,6 +629,85 @@ namespace utf {
 * @{
 */

+    using wchar_fixed_t = std::conditional_t< /* fixed-width character type chosen from sizeof(wchar_t) */
+        sizeof(wchar_t) == sizeof(char32_t),
+        char32_t, /* wchar_t is as wide as char32_t */
+        std::conditional_t<
+            sizeof(wchar_t) == sizeof(char16_t),
+            char16_t, /* wchar_t is as wide as char16_t */
+            char /* neither size matched: fall back to char */
+        >
+    >;
+
+    namespace detail { /* per-type constants read through utf::max_units */
+        template<typename C>
+        struct max_units_base; /* declared only, so an unlisted C fails to compile */
+
+        template<>
+        struct max_units_base<char32_t> {
+            static constexpr std::size_t const value = 1; /* presumably one unit per code point */
+        };
+
+        template<>
+        struct max_units_base<char16_t> {
+            static constexpr std::size_t const value = 2; /* presumably at most two units per code point */
+        };
+
+        template<>
+        struct max_units_base<char> {
+            static constexpr std::size_t const value = 4; /* presumably at most four units per code point */
+        };
+
+        template<>
+        struct max_units_base<wchar_t>: max_units_base<wchar_fixed_t> {}; /* same value as the matching fixed-width type */
+    } /* namespace detail */
+
+    template<typename C> /* C must be char, char16_t, char32_t or wchar_t */
+    static inline constexpr std::size_t const max_units = /* 4, 2, 1, or for wchar_t the value of wchar_fixed_t */
+        detail::max_units_base<C>::value;
+
+    static inline constexpr bool const is_wchar_u32 = /* true when wchar_t has the size of char32_t */
+        std::is_same_v<wchar_fixed_t, char32_t>;
+
+    static inline constexpr bool const is_wchar_u16 = /* true when wchar_t is not char32_t-sized but is char16_t-sized */
+        std::is_same_v<wchar_fixed_t, char16_t>;
+
+    static inline constexpr bool const is_wchar_u8 = /* true when wchar_t matches neither char32_t nor char16_t in size */
+        std::is_same_v<wchar_fixed_t, char>;
+
+    namespace detail { /* per-type flags read through utf::is_character */
+        template<typename C>
+        struct is_character_base { /* primary template: any type not specialized below */
+            static constexpr bool const value = false;
+        };
+
+        template<> /* the four specializations below are the types reported as characters */
+        struct is_character_base<char> {
+            static constexpr bool const value = true;
+        };
+
+        template<>
+        struct is_character_base<char16_t> {
+            static constexpr bool const value = true;
+        };
+
+        template<>
+        struct is_character_base<char32_t> {
+            static constexpr bool const value = true;
+        };
+
+        template<>
+        struct is_character_base<wchar_t> {
+            static constexpr bool const value = true;
+        };
+    }
+
+    template<typename C>
+    static inline constexpr bool const is_character = /* true only for char, char16_t, char32_t and wchar_t */
+        detail::is_character_base<C>::value;
+
+    static inline constexpr char32_t const max_unicode = 0x10FFFF; /* largest value accepted as a code point */
+
    /** @brief Thrown on UTF-8 decoding failure. */
    struct utf_error: std::runtime_error {
        using std::runtime_error::runtime_error;
@@ -739,10 +818,7 @@ namespace utf {
         * actually decodes; in both cases it encodes to utf-8,
         * for utf-8 the whole thing is just an advancing wrapper
         */
-        if constexpr(
-            (sizeof(wchar_t) == sizeof(char32_t)) ||
-            (sizeof(wchar_t) == sizeof(char16_t))
-        ) {
+        if constexpr(is_wchar_u32 || is_wchar_u16) {
            auto rr = r;
            if (char32_t ch; utf::decode(rr, ch)) {
                if (std::size_t ret; (ret = utf::encode_u8(sink, ch))) {
@@ -821,10 +897,7 @@ namespace utf {
        /* when wchar_t is guaranteed utf-16, we have an identity
         * match so we just advance; otherwise decode and encode
         */
-        if constexpr(
-            (sizeof(wchar_t) != sizeof(char32_t)) &&
-            (sizeof(wchar_t) == sizeof(char16_t))
-        ) {
+        if constexpr(is_wchar_u16) {
            if (!r.empty()) {
                sink.put(char16_t(r.front()));
                r.pop_front();
@@ -874,10 +947,10 @@ namespace utf {
    template<typename R>
    inline std::size_t encode_uw(R &sink, char32_t ch) {
        std::size_t n;
-        if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
+        if constexpr(is_wchar_u32) {
            n = 1;
            sink.put(wchar_t(ch));
-        } else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
+        } else if constexpr(is_wchar_u16) {
            char16_t buf[2];
            n = detail::u16_encode(buf, ch);
            for (std::size_t i = 0; i < n; ++i) {
@@ -909,10 +982,7 @@ namespace utf {
         * match much like encode_u16 with wstring, otherwise
         * decode and encode
         */
-        if constexpr(
-            (sizeof(wchar_t) != sizeof(char32_t)) &&
-            (sizeof(wchar_t) == sizeof(char16_t))
-        ) {
+        if constexpr(is_wchar_u16) {
            if (!r.empty()) {
                sink.put(wchar_t(r.front()));
                r.pop_front();
@@ -936,10 +1006,7 @@ namespace utf {
         * match so there is no reencoding, otherwise decode and
         * encode...
         */
-        if constexpr(
-            (sizeof(wchar_t) != sizeof(char32_t)) &&
-            (sizeof(wchar_t) != sizeof(char16_t))
-        ) {
+        if constexpr(is_wchar_u8) {
            if (!r.empty()) {
                sink.put(wchar_t(r.front()));
                r.pop_front();
@@ -972,12 +1039,7 @@ namespace utf {
    inline std::size_t encode(
        [[maybe_unused]] OR &sink, [[maybe_unused]] IR &r
    ) {
-        static_assert(
-            std::is_same_v<C, char32_t> ||
-            std::is_same_v<C, char16_t> ||
-            std::is_same_v<C, char> ||
-            std::is_same_v<C, wchar_t>, "Invalid input type"
-        );
+        static_assert(is_character<C>, "Invalid input type");
        if constexpr(std::is_same_v<C, char32_t>) {
            return encode_u32(sink, r);
        } else if constexpr(std::is_same_v<C, char16_t>) {
@@ -1059,7 +1121,7 @@ namespace utf {

        private:
            void advance() {
-                auto r = basic_char_range<OC>(p_buf, p_buf + sizeof(p_buf));
+                auto r = basic_char_range<OC>(p_buf, p_buf + max_units<OC>);
                if (std::size_t n; !(n = utf::encode<OC>(r, p_range))) {
                    /* range is unchanged */
                    p_left = basic_char_range<OC>{};
@@ -1071,7 +1133,7 @@ namespace utf {

            basic_char_range<IC const> p_range;
            basic_char_range<OC> p_left{};
-            OC p_buf[4];
+            OC p_buf[max_units<OC>];
        };
    } /* namespace detail */

--- a/src/string.cc
+++ b/src/string.cc
@@ -14,11 +14,9 @@ namespace utf {
 /* place the vtable in here */
 utf_error::~utf_error() {}

-constexpr char32_t MaxCodepoint = 0x10FFFF;
-
 namespace detail {
    inline bool is_invalid_u32(char32_t c) {
-        return (((c >= 0xD800) && (c <= 0xDFFF)) || (c > MaxCodepoint));
+        return (((c >= 0xD800) && (c <= 0xDFFF)) || (c > utf::max_unicode)); /* in 0xD800..0xDFFF or above max_unicode */
    }

    static inline std::size_t u8_decode(
@@ -118,7 +116,7 @@ namespace detail {
            ret[2] = char(0x80 |  (ch       & 0x3F));
            return 3;
        }
-        if (ch <= MaxCodepoint) {
+        if (ch <= utf::max_unicode) {
            ret[0] = char(0xF0 |  (ch >> 18));
            ret[1] = char(0x80 | ((ch >> 12) | 0x3F));
            ret[2] = char(0x80 | ((ch >>  6) | 0x3F));
@@ -132,7 +130,7 @@ namespace detail {
        char16_t (&ret)[2], char32_t ch
    ) noexcept {
        /* surrogate code point or out of bounds */
-        if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > MaxCodepoint)) {
+        if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > utf::max_unicode)) {
            return 0;
        }
        if (ch <= 0xFFFF) {
@@ -161,10 +159,7 @@ namespace detail {
    template<typename C>
    inline std::size_t length(basic_char_range<C const> &r) noexcept {
        std::size_t ret = 0;
-        if constexpr(std::is_same_v<C, char32_t> || (
-            std::is_same_v<C, wchar_t> &&
-            (sizeof(wchar_t) == sizeof(char32_t))
-        )) {
+        if constexpr(utf::max_units<C> == 1) {
            ret = r.size();
        } else {
            for (;; ++ret) {
@@ -215,7 +210,7 @@ bool decode(u32string_range &r, char32_t &ret) noexcept {

 bool decode(wstring_range &r, char32_t &ret) noexcept {
    std::size_t n, tn = r.size();
-    if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
+    if constexpr(is_wchar_u32) {
        if (!tn) {
            return false;
        }
@@ -226,7 +221,7 @@ bool decode(wstring_range &r, char32_t &ret) noexcept {
        ret = c;
        r.pop_front();
        return true;
-    } else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
+    } else if constexpr(is_wchar_u16) {
        auto *beg = reinterpret_cast<char16_t const *>(r.data());
        n = detail::u16_decode(beg, beg + tn, ret);
    } else {
@@ -319,7 +314,7 @@ bool isvalid(char32_t c) noexcept {
        return false;
    }
    /* must be within range */
-    return (c <= MaxCodepoint);
+    return (c <= utf::max_unicode);
 }

 bool isxdigit(char32_t c) noexcept {
@@ -567,15 +562,7 @@ int case_compare(u32string_range s1, u32string_range s2) noexcept {
 }

 int case_compare(wstring_range s1, wstring_range s2) noexcept {
-    using C = std::conditional_t<
-        sizeof(wchar_t) == sizeof(char32_t),
-        char32_t,
-        std::conditional_t<
-            sizeof(wchar_t) == sizeof(char16_t),
-            char16_t,
-            unsigned char
-        >
-    >;
+    using C = std::conditional_t<is_wchar_u8, unsigned char, wchar_fixed_t>;
    auto *beg1 = reinterpret_cast<C const *>(s1.data());
    auto *beg2 = reinterpret_cast<C const *>(s2.data());
    return detail::case_compare(beg1, beg1 + s1.size(), beg2, beg2 + s2.size());