direct encoding funcs to u8/u16/uw from any other UTF

2018-01-05 21:49:00 +01:00 · 2018-01-05 21:49:00 +01:00 · 58ccfbe276
parent daea42666e
commit 58ccfbe276
2 changed files with 172 additions and 48 deletions
--- a/ostd/format.hh
+++ b/ostd/format.hh
@ -1022,52 +1022,6 @@ private:
        }
    }

-    template<typename R>
-    void write_seq(R &writer, string_range &val) const {
-        writer.put(val.front());
-        val.pop_front();
-    }
-
-    template<typename R>
-    void write_seq(R &writer, u16string_range &val) const {
-        if (char32_t ret; !utf::decode(val, ret)) {
-            write_replacement(writer);
-            val.pop_front();
-        } else {
-             if (!utf::encode_u8(writer, ret)) {
-                write_replacement(writer);
-             }
-        }
-    }
-
-    template<typename R>
-    void write_seq(R &writer, u32string_range &val) const {
-        if (!utf::encode_u8(writer, val.front())) {
-            write_replacement(writer);
-        }
-        val.pop_front();
-    }
-
-    template<typename R>
-    void write_seq(R &writer, wstring_range &val) const {
-        if constexpr(
-            (sizeof(wchar_t) == sizeof(char32_t)) ||
-            (sizeof(wchar_t) == sizeof(char16_t))
-        ) {
-            if (char32_t ret; !utf::decode(val, ret)) {
-                write_replacement(writer);
-                val.pop_front();
-            } else {
-                if (!utf::encode_u8(writer, ret)) {
-                    write_replacement(writer);
-                }
-            }
-        } else {
-            writer.put(char(val.front()));
-            val.pop_front();
-        }
-    }
-
    /* string base writer */
    template<typename C, typename R>
    void write_str(R &writer, bool escape, basic_char_range<C const> val) const {
@ -1091,8 +1045,9 @@ private:
                        write_char_raw(writer, c);
                    }
                    val.pop_front();
-                } else {
-                    write_seq(writer, val);
+                } else if (!utf::encode_u8(writer, val)) {
+                    write_replacement(writer);
+                    val.pop_front();
                }
            }
            writer.put('"');
--- a/ostd/string.hh
+++ b/ostd/string.hh
@ -820,6 +820,7 @@ namespace utf {

    template<typename R>
    inline std::size_t encode_u8(R &sink, u32string_range &r) {
+        /* just a wrapper; does the same thing but advances */
        std::size_t n = 0;
        if (!r.empty() && (n = utf::encode_u8(sink, r.front()))) {
            r.pop_front();
@ -827,6 +828,57 @@ namespace utf {
        return n;
    }

+    template<typename R>
+    inline std::size_t encode_u8(R &sink, u16string_range &r) {
+        /* decode on a copy of r, encode to sink; commit r on success, else 0 */
+        auto rr = r;
+        if (char32_t ch; utf::decode(rr, ch)) {
+            if (std::size_t ret; (ret = utf::encode_u8(sink, ch))) {
+                r = rr;
+                return ret;
+            }
+        }
+        return 0;
+    }
+
+    template<typename R>
+    inline std::size_t encode_u8(R &sink, string_range &r) {
+        /* identity: put one code unit as is and advance; 0 when r is empty */
+        if (!r.empty()) {
+            sink.put(r.front());
+            r.pop_front();
+            return 1;
+        }
+        return 0;
+    }
+
+    template<typename R>
+    inline std::size_t encode_u8(R &sink, wstring_range &r) {
+        /* wchar_t sized like char32_t or char16_t: decode from a copy of r,
+         * encode to sink and commit the advance only if both succeed; else
+         * put front() as a char and advance by one; returns 0 otherwise
+         */
+        if constexpr(
+            (sizeof(wchar_t) == sizeof(char32_t)) ||
+            (sizeof(wchar_t) == sizeof(char16_t))
+        ) {
+            auto rr = r;
+            if (char32_t ch; utf::decode(rr, ch)) {
+                if (std::size_t ret; (ret = utf::encode_u8(sink, ch))) {
+                    r = rr;
+                    return ret;
+                }
+            }
+        } else {
+            if (!r.empty()) {
+                sink.put(char(r.front()));
+                r.pop_front();
+                return 1;
+            }
+        }
+        return 0;
+    }
+
    /* @brief Encode a UTF-32 code point into UTF-16.
     *
     * The values are written in `sink` which is an ostd::output_range_tag.
@ -851,6 +903,7 @@ namespace utf {

    template<typename R>
    inline std::size_t encode_u16(R &sink, u32string_range &r) {
+        /* just a wrapper; does the same thing but advances */
        std::size_t n = 0;
        if (!r.empty() && (n = utf::encode_u16(sink, r.front()))) {
            r.pop_front();
@ -858,6 +911,56 @@ namespace utf {
        return n;
    }

+    template<typename R>
+    inline std::size_t encode_u16(R &sink, u16string_range &r) {
+        /* identity: put one code unit as is and advance; 0 when r is empty */
+        if (!r.empty()) {
+            sink.put(r.front());
+            r.pop_front();
+            return 1;
+        }
+        return 0;
+    }
+
+    template<typename R>
+    inline std::size_t encode_u16(R &sink, string_range &r) {
+        /* decode on a copy of r, encode to sink; commit r on success, else 0 */
+        auto rr = r;
+        if (char32_t ch; utf::decode(rr, ch)) {
+            if (std::size_t ret; (ret = utf::encode_u16(sink, ch))) {
+                r = rr;
+                return ret;
+            }
+        }
+        return 0;
+    }
+
+    template<typename R>
+    inline std::size_t encode_u16(R &sink, wstring_range &r) {
+        /* wchar_t sized like char16_t (and unlike char32_t): copy one unit
+         * as char16_t and advance; else decode, encode, commit r on success
+         */
+        if constexpr(
+            (sizeof(wchar_t) != sizeof(char32_t)) &&
+            (sizeof(wchar_t) == sizeof(char16_t))
+        ) {
+            if (!r.empty()) {
+                sink.put(char16_t(r.front()));
+                r.pop_front();
+                return 1;
+            }
+        } else {
+            auto rr = r;
+            if (char32_t ch; utf::decode(rr, ch)) {
+                if (std::size_t ret; (ret = utf::encode_u16(sink, ch))) {
+                    r = rr;
+                    return ret;
+                }
+            }
+        }
+        return 0;
+    }
+
    /* @brief Encode a UTF-32 code point into a wide Unicode char/sequence.
     *
     * The value(s) are written in `sink` which is an ostd::output_range_tag.
@ -902,6 +1005,7 @@ namespace utf {

    template<typename R>
    inline std::size_t encode_uw(R &sink, u32string_range &r) {
+        /* just a wrapper; does the same thing but advances */
        std::size_t n = 0;
        if (!r.empty() && (n = utf::encode_uw(sink, r.front()))) {
            r.pop_front();
@ -909,6 +1013,71 @@ namespace utf {
        return n;
    }

+    template<typename R>
+    inline std::size_t encode_uw(R &sink, u16string_range &r) {
+        /* wchar_t sized like char16_t (and unlike char32_t): copy one unit
+         * as wchar_t and advance; otherwise decode from a copy of r, encode
+         * via encode_uw and commit the advance on success; 0 otherwise
+         */
+        if constexpr(
+            (sizeof(wchar_t) != sizeof(char32_t)) &&
+            (sizeof(wchar_t) == sizeof(char16_t))
+        ) {
+            if (!r.empty()) {
+                sink.put(wchar_t(r.front()));
+                r.pop_front();
+                return 1;
+            }
+        } else {
+            auto rr = r;
+            if (char32_t ch; utf::decode(rr, ch)) {
+                if (std::size_t ret; (ret = utf::encode_uw(sink, ch))) {
+                    r = rr;
+                    return ret;
+                }
+            }
+        }
+        return 0;
+    }
+
+    template<typename R>
+    inline std::size_t encode_uw(R &sink, string_range &r) {
+        /* wchar_t sized like neither char32_t nor char16_t: copy one unit as
+         * wchar_t and advance (no reencoding); otherwise decode from a copy
+         * of r, encode via encode_uw, commit on success; 0 otherwise
+         */
+        if constexpr(
+            (sizeof(wchar_t) != sizeof(char32_t)) &&
+            (sizeof(wchar_t) != sizeof(char16_t))
+        ) {
+            if (!r.empty()) {
+                sink.put(wchar_t(r.front()));
+                r.pop_front();
+                return 1;
+            }
+        } else {
+            auto rr = r;
+            if (char32_t ch; utf::decode(rr, ch)) {
+                if (std::size_t ret; (ret = utf::encode_uw(sink, ch))) {
+                    r = rr;
+                    return ret;
+                }
+            }
+        }
+        return 0;
+    }
+
+    template<typename R>
+    inline std::size_t encode_uw(R &sink, wstring_range &r) {
+        /* identity: put one code unit as is and advance; 0 when r is empty */
+        if (!r.empty()) {
+            sink.put(wchar_t(r.front()));
+            r.pop_front();
+            return 1;
+        }
+        return 0;
+    }
+
    /* @brief Get the number of Unicode code points in a string.
     *
     * This function keeps reading Unicode code points while it can and