add funcs to deal with decoding/encoding of wchar_t values/sequences

2018-01-05 19:25:20 +01:00 · 2018-01-05 19:25:20 +01:00 · 200919d96f
parent be25d42660
commit 200919d96f
2 changed files with 77 additions and 5 deletions
--- a/ostd/string.hh
+++ b/ostd/string.hh
@ -777,6 +777,16 @@ namespace utf {
        return true;
    }

+    /* @brief Get the Unicode code point for a wide Unicode char/sequence.
+     *
+     * The wide characters in `r` are treated as UTF-32, UTF-16 or UTF-8
+     * depending on the size of `wchar_t`. Typically, that will be UTF-16
+     * on Windows and UTF-32 on Unix-like systems. On systems where two
+     * or more of the character types are the same size, UTF-32 takes
+     * priority, then UTF-16, with UTF-8 being the fallback.
+     *
+     * On success, the decoded code point is stored in `ret` and `true`
+     * is returned.
+     *
+     * NOTE(review): in the implementation added alongside this
+     * declaration, the UTF-32 path returns `true` without advancing `r`,
+     * while the UTF-8 and UTF-16 paths slice the consumed units off the
+     * front of `r` -- confirm whether popping the front element was
+     * intended there as well.
+     */
+    bool decode(wstring_range &r, char32_t &ret) noexcept;
+
    namespace detail {
        std::size_t u8_encode(
            char (&ret)[4], char32_t ch
@ -830,6 +840,48 @@ namespace utf {
        return n;
    }

+    /* @brief Write a UTF-32 code point to a sink as wide character(s).
+     *
+     * `sink` is an output range (see ostd::output_range_tag); every value
+     * is handed to it through `sink.put()` as a `wchar_t`. How many values
+     * get written is decided at compile time by the size of `wchar_t`:
+     *
+     * - same size as `char32_t`: `wchar_t` is treated as UTF-32 and the
+     *   input is simply type cast and written as a single value;
+     * - same size as `char16_t`: `wchar_t` is treated as UTF-16 and the
+     *   input is encoded into one or two UTF-16 values;
+     * - anything else: `wchar_t` is treated the same as `char` and the
+     *   input is encoded as UTF-8, writing up to 4 code units.
+     *
+     * The only exceptions this function lets through are those thrown by
+     * `sink`. With a UTF-32 `wchar_t` the encoding isn't allowed to fail;
+     * with UTF-8 or UTF-16, the failure points are the usual ones (a
+     * surrogate code point as input, or an input greater than 0x10FFFF).
+     *
+     * The return value is the number of values written into the sink.
+     */
+    template<typename R>
+    inline std::size_t encode_uw(R &sink, char32_t ch) {
+        if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
+            /* UTF-32: a plain cast, always exactly one value */
+            sink.put(wchar_t(ch));
+            return 1;
+        } else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
+            /* UTF-16: one unit or a surrogate pair */
+            char16_t units[2];
+            std::size_t const count = detail::u16_encode(units, ch);
+            std::size_t idx = 0;
+            while (idx != count) {
+                sink.put(wchar_t(units[idx]));
+                ++idx;
+            }
+            return count;
+        } else {
+            /* fallback: UTF-8 code units widened to wchar_t */
+            char units[4];
+            std::size_t const count = detail::u8_encode(units, ch);
+            std::size_t idx = 0;
+            while (idx != count) {
+                sink.put(wchar_t(units[idx]));
+                ++idx;
+            }
+            return count;
+        }
+    }
+
    /* @brief Get the number of Unicode code points in a string.
     *
     * This function keeps reading Unicode code points while it can and
--- a/src/string.cc
+++ b/src/string.cc
@ -146,8 +146,7 @@ namespace detail {
 bool decode(string_range &r, char32_t &ret) noexcept {
    auto tn = r.size();
    auto *beg = reinterpret_cast<unsigned char const *>(r.data());
-    auto *end = beg + tn;
-    if (std::size_t n; (n = detail::u8_decode(beg, end, ret))) {
+    if (std::size_t n; (n = detail::u8_decode(beg, beg + tn, ret))) {
        r = r.slice(n, tn);
        return true;
    }
@ -156,9 +155,30 @@ bool decode(string_range &r, char32_t &ret) noexcept {

 bool decode(u16string_range &r, char32_t &ret) noexcept {
+    /* Decode one code point from the front of `r` into `ret`. The value
+     * returned by `detail::u16_decode` is used as the number of leading
+     * units to drop from `r`; a zero result is treated as failure, in
+     * which case `r` is not reassigned and `false` is returned.
+     */
    auto tn = r.size();
-    auto *beg = reinterpret_cast<char16_t const *>(r.data());
-    auto *end = beg + tn;
-    if (std::size_t n; (n = detail::u16_decode(beg, end, ret))) {
+    auto *beg = r.data();
+    if (std::size_t n; (n = detail::u16_decode(beg, beg + tn, ret))) {
+        r = r.slice(n, tn);
+        return true;
+    }
+    return false;
+}
+
+bool decode(wstring_range &r, char32_t &ret) noexcept {
+    std::size_t n, tn = r.size();
+    if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
+        if (!tn) {
+            return false;
+        }
+        ret = char32_t(r.front());
+        return true;
+    } else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
+        auto *beg = reinterpret_cast<char16_t const *>(r.data());
+        n = detail::u16_decode(beg, beg + tn, ret);
+    } else {
+        auto *beg = reinterpret_cast<unsigned char const *>(r.data());
+        n = detail::u8_decode(beg, beg + tn, ret);
+    }
+    if (n) {
        r = r.slice(n, tn);
        return true;
    }