expose unicode stuff through string slices

2017-12-31 19:12:51 +01:00 · 2017-12-31 19:12:51 +01:00 · d3cdbe2fcf
parent fb2f9e3b0e
commit d3cdbe2fcf
3 changed files with 34 additions and 8 deletions
--- a/ostd/format.hh
+++ b/ostd/format.hh
@ -257,7 +257,6 @@ namespace detail {

    inline int wc_to_mb_loc(wchar_t c, char *buf, std::locale const &loc) {
        auto &f = std::use_facet<std::codecvt<wchar_t, char, std::mbstate_t>>(loc);
-        std::mbstate_t mb{};
        return ac_to_mb(c, f, buf);
    }
 }
--- a/ostd/string.hh
+++ b/ostd/string.hh
@ -89,7 +89,7 @@ namespace utf {
     * The string is advanced past the UTF-8 character in the front.
     * If the decoding fails, `false` is returned, otherwise it's `true`.
     */
-    bool codepoint(string_range &r, char32_t &ret);
+    bool codepoint(string_range &r, char32_t &ret) noexcept;

    /* @brief Get the number of Unicode code points in a string.
     *
@ -102,7 +102,7 @@ namespace utf {
     * If you're sure the string is valid or you don't need to handle the
     * error, you can use the more convenient overload below.
     */
-    std::size_t length(string_range r, string_range &cont);
+    std::size_t length(string_range r, string_range &cont) noexcept;

    /* @brief Get the number of Unicode code points in a valid UTF-8 string.
     *
@ -112,7 +112,7 @@ namespace utf {
     * If you need to get the continuation string, use the general
     * error-handling overload of the function.
     */
-    std::size_t length(string_range r);
+    std::size_t length(string_range r) noexcept;
 } /* namespace utf */

 /** @brief A string slice type.
@ -279,6 +279,22 @@ public:
    /** @brief Gets the number of value_type in the slice. */
    size_type size() const noexcept { return p_end - p_beg; }

+    /** @brief Gets the number of code points in the slice.
+     *
+     * Effectively the same as utf::length().
+     */
+    size_type length() const noexcept {
+        return utf::length(*this);
+    }
+
+    /** @brief Gets the number of code points, with error handling.
+     *
+     * Same as utf::length() with a continuation string (`cont`).
+     */
+    size_type length(basic_char_range &cont) const noexcept {
+        return utf::length(*this, cont);
+    }
+
    /** @brief Creates a sub-slice of the slice.
     *
     * Behavior is undefined if `start` and `end` are not within the
@ -374,6 +390,12 @@ diffsize:
        return (s1 < s2) ? -1 : ((s1 > s2) ? 1 : 0);
    }

+    /** @brief Iterate over the code points of the string.
+     *
+     * Like utf::iter_codes().
+     */
+    inline auto iter_codes() const;
+
    /** @brief Implicitly converts a string slice to std::basic_string_view.
     *
     * String views represent more or less the same thing but they're always
@ -787,6 +809,11 @@ namespace utf {

 } /* namespace utf */

+template<typename T>
+inline auto basic_char_range<T>::iter_codes() const {
+    return utf::iter_codes(*this);
+}
+
 /* string literals */

 inline namespace literals {
--- a/src/string.cc
+++ b/src/string.cc
@ -11,7 +11,7 @@ namespace utf {

 constexpr std::uint32_t MaxCodepoint = 0x10FFFF;

-static inline bool codepoint_dec(string_range &r, char32_t &cret) {
+static inline bool codepoint_dec(string_range &r, char32_t &cret) noexcept {
    static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
    if (r.empty()) {
        return false;
@ -61,11 +61,11 @@ static inline bool codepoint_dec(string_range &r, char32_t &cret) {
    return true;
 }

-bool codepoint(string_range &r, char32_t &ret) {
+bool codepoint(string_range &r, char32_t &ret) noexcept {
    return codepoint_dec(r, ret);
 }

-std::size_t length(string_range r, string_range &cont) {
+std::size_t length(string_range r, string_range &cont) noexcept {
    std::size_t ret = 0;
    for (char32_t ch = U'\0'; codepoint_dec(r, ch); ++ret) {
        continue;
@ -74,7 +74,7 @@ std::size_t length(string_range r, string_range &cont) {
    return ret;
 }

-std::size_t length(string_range r) {
+std::size_t length(string_range r) noexcept {
    return length(r, r);
 }