add some initial code for upcoming unicode support

2017-12-31 03:00:23 +01:00 · 2017-12-31 03:00:23 +01:00 · 3c75d7db98
parent f7929a1b45
commit 3c75d7db98
2 changed files with 120 additions and 11 deletions
--- a/ostd/string.hh
+++ b/ostd/string.hh
@ -68,6 +68,44 @@ namespace ostd {
 * @{
 */

+template<typename T> struct basic_char_range;
+
+/** @brief A mutable slice over `char`. */
+using char_range = basic_char_range<char>;
+
+/** @brief An immutable slice over `char`.
+ *
+ * This is used in most libostd APIs that read strings. More or less
+ * anything is convertible to it, including mutable slices, so it's
+ * a perfect fit as long as modifications are not necessary.
+ *
+ * The alias is introduced ahead of the `basic_char_range` definition,
+ * relying on the forward declaration above, so that the `utf` function
+ * declarations that follow can already name it.
+ */
+using string_range = basic_char_range<char const>;
+
+namespace utf {
+    /** @brief Get the number of Unicode code points in a valid UTF-8 string.
+     *
+     * If an invalid UTF-8 sequence is encountered, it returns the length
+     * until that sequence.
+     *
+     * If you need to get the continuation string, use the error-handling
+     * overload of the function.
+     */
+    std::size_t length(string_range r);
+
+    /** @brief Get the number of Unicode code points in a string.
+     *
+     * This function keeps reading Unicode code points while it can and
+     * once it can't it returns the number of valid ones with the rest
+     * of the input string range being in `cont`. That means if the entire
+     * string is a valid UTF-8 string, `cont` will be empty, otherwise it
+     * will begin at the first invalid UTF-8 code point.
+     *
+     * If you're sure the string is valid or you don't need to handle the
+     * error, you can use the more convenient overload above.
+     */
+    std::size_t length(string_range r, string_range &cont);
+} /* namespace utf */
+
 /** @brief A string slice type.
 *
 * This is a contiguous range over a character type. The character type
@ -344,17 +382,6 @@ private:
    T *p_beg, *p_end;
 };

-/** @brief A mutable slice over `char`. */
-using char_range = basic_char_range<char>;
-
-/** @brief An immutable slice over `char`.
- *
- * This is used in most libostd APIs that read strings. More or less
- * anything is convertible to it, including mutable slices, so it's
- * a perfect fit as long as modifications are not necessary.
- */
-using string_range = basic_char_range<char const>;
-
 /* comparisons between ranges */

 /** @brief Like `!lhs.compare(rhs)`. */
--- a/src/string.cc
+++ b/src/string.cc
@ -0,0 +1,82 @@
+/* String implementation details, mainly regarding Unicode support.
+ *
+ * This file is part of libostd. See COPYING.md for further information.
+ */
+
+#include <cstdint>
+#include "ostd/string.hh"
+
+namespace ostd {
+namespace utf {
+
+constexpr std::uint32_t MaxCodepoint = 0x10FFFF; /* decoded values above this are rejected as invalid */
+
+/* Check whether `ch` looks like a UTF-8 continuation byte, i.e. whether
+ * bits 7 and 6 of the value are `1` and `0` respectively (`10xxxxxx`).
+ */
+static inline bool is_u8cont(std::uint32_t ch) {
+    std::uint32_t const top_bits = ch & 0xC0;
+    return top_bits == 0x80;
+}
+
+/* Decode a single UTF-8 encoded code point from the front of `r`.
+ *
+ * On success the decoded value is written to `cret`, `r` is advanced past
+ * all bytes of the sequence and `true` is returned.
+ *
+ * On failure `false` is returned and neither `r` nor `cret` is modified.
+ * Failure means one of: the range is empty, the sequence is truncated,
+ * an expected continuation byte is not of the form `10xxxxxx`, the lead
+ * byte is itself a continuation byte, the lead byte announces more than
+ * three continuation bytes, the encoding is overlong, or the decoded
+ * value is greater than `MaxCodepoint`.
+ */
+static inline bool codepoint(string_range &r, char32_t &cret) {
+    /* indexed by the number of continuation bytes: a sequence with that
+     * many continuation bytes must decode to a value strictly greater than
+     * the entry, otherwise a shorter encoding exists (overlong sequence);
+     * the 0xFF at index 0 rejects non-ASCII lead bytes that announce no
+     * continuation bytes at all (i.e. stray continuation bytes)
+     */
+    static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
+    if (r.empty()) {
+        return false;
+    }
+    std::uint32_t ch = static_cast<unsigned char>(r.front());
+    if (ch <= 0x7F) {
+        /* ASCII */
+        cret = static_cast<char32_t>(ch);
+        r.pop_front();
+        return true;
+    }
+    std::uint32_t ret = 0;
+    /* work on a copy so that `r` stays intact when decoding fails */
+    string_range sr = r;
+    sr.pop_front();
+    /* number of continuation bytes consumed so far; this must not include
+     * the lead byte, as it's used both as the shift amount multiplier for
+     * the lead byte bits and as the index into `ulim`
+     */
+    std::size_t n = 0;
+    /* every set bit below the top one in the lead byte announces one
+     * continuation byte; shifting left moves the next marker bit into
+     * the 0x40 position
+     */
+    for (; ch & 0x40; ch <<= 1) {
+        /* need a continuation byte but nothing left in the string */
+        if (sr.empty()) {
+            return false;
+        }
+        /* the continuation byte */
+        std::uint32_t nch = static_cast<unsigned char>(sr.front());
+        sr.pop_front();
+        /* lower 6 bits */
+        std::uint32_t bch = nch & 0x3F;
+        /* not a continuation byte */
+        if ((nch ^ bch) != 0x80) {
+            return false;
+        }
+        /* the 6 bits go in the result */
+        ret = (ret << 6) | bch;
+        ++n;
+    }
+    /* invalid sequence - too many continuation bytes */
+    if (n > 3) {
+        return false;
+    }
+    /* add the up to 7 bits from the first byte; `ch` was already shifted
+     * left by n in the loop, so n * 5 more gives the required n * 6
+     */
+    ret |= (ch & 0x7F) << (n * 5);
+    /* invalid sequence - out of bounds or overlong */
+    if ((ret > MaxCodepoint) || (ret <= ulim[n])) {
+        return false;
+    }
+    cret = static_cast<char32_t>(ret);
+    r = sr;
+    return true;
+}
+
+/* Count how many code points can be decoded from the front of `r` and
+ * hand whatever could not be decoded back to the caller through `cont`.
+ */
+std::size_t length(string_range r, string_range &cont) {
+    std::size_t count = 0;
+    char32_t decoded = U'\0';
+    /* each successful decode advances `r` past one code point */
+    while (codepoint(r, decoded)) {
+        ++count;
+    }
+    cont = r;
+    return count;
+}
+
+/* Convenience overload for callers that don't care about the remainder. */
+std::size_t length(string_range r) {
+    /* `r` is our own copy, so it can double as the throwaway output range */
+    std::size_t const count = length(r, r);
+    return count;
+}
+
+} /* namespace utf */
+} /* namespace ostd */