add by-utf8-byte iteration for all slice types

2018-01-06 01:59:13 +01:00 · 2018-01-06 01:59:13 +01:00 · d94e75cd18
parent 4a992d64b5
commit d94e75cd18
1 changed files with 105 additions and 6 deletions
--- a/ostd/string.hh
+++ b/ostd/string.hh
@ -353,6 +353,12 @@ public:
     */
    inline auto iter_codes() const;

+    /** @brief Iterate over the UTF-8 code units of the string.
+     *
+     * Like utf::iter_bytes().
+     */
+    inline auto iter_bytes() const;
+
    /** @brief Implicitly converts a string slice to std::basic_string_view.
     *
     * String views represent more or less the same thing but they're always
@ -986,9 +992,7 @@ namespace utf {

            codepoint_range() = delete;
            codepoint_range(basic_char_range<C const> r): p_range(r) {
-                if (r.empty()) {
-                    p_current = -1;
-                } else {
+                if (!r.empty()) {
                    advance();
                }
            }
@ -1019,7 +1023,57 @@ namespace utf {
            }

            basic_char_range<C const> p_range;
-            std::int32_t p_current;
+            std::int32_t p_current = -1;
+        };
+
+        template<typename C>
+        struct codeunit_range: input_range<codeunit_range<C>> {
+            using range_category = forward_range_tag;
+            using value_type = char;
+            using reference = char;
+            using size_type = std::size_t;
+
+            codeunit_range() = delete;
+            codeunit_range(basic_char_range<C const> r): p_range(r) {
+                if (!r.empty()) {
+                    advance();
+                }
+            }
+
+            bool empty() const { return p_left.empty(); }
+
+            void pop_front() {
+                std::size_t n = p_left.size();
+                if (n > 1) {
+                    p_left.pop_front();
+                    return;
+                }
+                if ((n == 1) && p_range.empty()) {
+                    p_left = char_range{};
+                    return;
+                }
+                advance();
+            }
+
+            char front() const {
+                return p_left.front();
+            }
+
+        private:
+            void advance() {
+                auto r = ostd::char_range(p_buf, p_buf + sizeof(p_buf));
+                if (std::size_t n; !(n = encode_u8(r, p_range))) {
+                    /* range is unchanged */
+                    p_left = char_range{};
+                    throw utf_error{"UTF-8 encoding failed"};
+                } else {
+                    p_left = ostd::char_range{p_buf, p_buf + n};
+                }
+            }
+
+            basic_char_range<C const> p_range;
+            char_range p_left = char_range{};
+            char p_buf[4];
        };
    } /* namespace detail */

@ -1034,15 +1088,55 @@ namespace utf {
        return detail::codepoint_range<char>{r};
    }

+    /** @brief Iterate over the code points of a UTF-16 string.
+     *
+     * The resulting range is ostd::forward_range_tag. The range will
+     * contain the code points of the given string. On error, which may
+     * be during any string advancement (the constructor or `pop_front()`),
+     * an ostd::utf_error is raised.
+     */
+    inline auto iter_codes(u16string_range r) noexcept {
+        return detail::codepoint_range<char16_t>{r};
+    }
+
    /** @brief Iterate over the code points of a UTF-32 string.
     *
-     * The resulting range is ostd::forward_range_tag. This cannot fail
-     * as it's essentially an identity range.
+     * The resulting range is ostd::forward_range_tag. This can actually
+     * fail just like the other ostd::iter_codes() variants if the string
+     * contains surrogates or code points that are out of bounds. If that
+     * happens, an ostd::utf_error is raised.
     */
    inline auto iter_codes(u32string_range r) noexcept {
        return detail::codepoint_range<char32_t>{r};
    }

+    /** @brief Iterate over the code points of a wide Unicode string.
+     *
+     * The resulting range is ostd::forward_range_tag. The range will
+     * contain the code points of the given string. On error, which may
+     * be during any string advancement (the constructor or `pop_front()`),
+     * an ostd::utf_error is raised.
+     */
+    inline auto iter_codes(wstring_range r) noexcept {
+        return detail::codepoint_range<wchar_t>{r};
+    }
+
+    inline auto iter_bytes(string_range r) {
+        return detail::codeunit_range<char>(r);
+    }
+
+    inline auto iter_bytes(u16string_range r) {
+        return detail::codeunit_range<char16_t>(r);
+    }
+
+    inline auto iter_bytes(u32string_range r) {
+        return detail::codeunit_range<char32_t>(r);
+    }
+
+    inline auto iter_bytes(wstring_range r) {
+        return detail::codeunit_range<wchar_t>(r);
+    }
+
    bool isalnum(char32_t c) noexcept;
    bool isalpha(char32_t c) noexcept;
    bool isblank(char32_t c) noexcept;
@ -1090,6 +1184,11 @@ inline auto basic_char_range<T>::iter_codes() const {
    return utf::iter_codes(*this);
 }

+template<typename T>
+inline auto basic_char_range<T>::iter_bytes() const {
+    return utf::iter_bytes(*this);
+}
+
 template<typename T>
 inline int basic_char_range<T>::case_compare(
    basic_char_range<T const> s