relax the rules of zero-argument utf::length

2018-01-06 00:27:04 +01:00 · 2018-01-06 00:27:04 +01:00 · aeb5023b30
parent e5162233d4
commit aeb5023b30
2 changed files with 21 additions and 8 deletions
--- a/ostd/string.hh
+++ b/ostd/string.hh
@@ -1104,15 +1104,15 @@ namespace utf {

    /* @brief Get the number of Unicode code points in a valid UTF-8 string.
     *
-     * If an invalid UTF-8 sequence is encountered, it returns the length
-     * until that sequence.
+     * If an invalid UTF-8 sequence is encountered, each invalid code
+     * unit is counted as 1 character; the resulting length is the
+     * number of valid code points plus the number of invalid code
+     * units, as if each had been replaced with a valid code point.
     *
-     * If you need to get the continuation string, use the general
-     * error-handling overload of the function.
+     * If you need to stop at an invalid code unit and get the
+     * continuation string, use the overload above.
     */
-    inline std::size_t length(string_range r) noexcept {
-        return utf::length(r, r);
-    }
+    std::size_t length(string_range r) noexcept;

    /* @brief Get the number of Unicode code points in a UTF-32 string.
     *
--- a/src/string.cc
+++ b/src/string.cc
@@ -191,13 +191,26 @@ bool decode(wstring_range &r, char32_t &ret) noexcept {

 std::size_t length(string_range r, string_range &cont) noexcept {
    std::size_t ret = 0;
-    for (char32_t ch = U'\0'; utf::decode(r, ch); ++ret) {
+    for (char32_t ch; utf::decode(r, ch); ++ret) { /* ch is never read here */
        continue;
    }
    cont = r;
    return ret;
 }

+std::size_t length(string_range r) noexcept {
+    std::size_t ret = 0; /* decoded code points + skipped units */
+    for (;; ++ret) { /* every pass that does not break adds 1 */
+        if (char32_t ch; !utf::decode(r, ch)) { /* ch itself is unused */
+            if (r.empty()) {
+                break; /* nothing left to decode: done */
+            }
+            /* NOTE(review): assumes a failed decode leaves r as is */
+            r.pop_front(); /* skip one unit; ++ret counts it as 1 */
+        }
+    }
+    return ret;
+}
+
 /* unicode-aware ctype
 * the other ones use custom tables for lookups
 */