namespace detail {
    /* Low-level UTF-8 encoder, defined out of line.
     *
     * Writes the UTF-8 code units for `ch` into `ret` and returns how many
     * units were written; a return value of `0` signals failure.
     */
    std::uint8_t u8_encode(
        std::uint8_t (&ret)[4], std::uint32_t ch
    ) noexcept;
}

/* @brief Encode a UTF-32 code point into UTF-8 code units.
 *
 * The code units are written into `sink`, an output range, by calling
 * `sink.put()` once per unit, in order. At most 4 units are written.
 * The number of units written is returned from the function. In case
 * of failure, `0` is returned and nothing is written.
 *
 * This function is allowed to fail only in two cases, when a surrogate
 * code point is provided or when the code point is out of bounds as
 * defined by Unicode (i.e. greater than 0x10FFFF). It does not throw
 * exceptions other than those thrown by `sink`.
 *
 * @param sink The output range receiving the code units.
 * @param ch The code point to encode.
 *
 * @returns The number of code units written, `0` on failure.
 */
template<typename R>
std::uint8_t encode_u8(R &sink, char32_t ch) {
    /* encode into a local buffer first, so that nothing reaches the
     * sink when the code point turns out to be invalid (n == 0)
     */
    std::uint8_t buf[4];
    std::uint8_t n = detail::u8_encode(buf, ch);
    for (std::uint8_t i = 0; i < n; ++i) {
        sink.put(buf[i]);
    }
    return n;
}
* * This function keeps reading Unicode code points while it can and diff --git a/src/string.cc b/src/string.cc index 49ef977..c79b76c 100644 --- a/src/string.cc +++ b/src/string.cc @@ -11,63 +11,99 @@ namespace utf { constexpr std::uint32_t MaxCodepoint = 0x10FFFF; -static inline bool u8_decode(string_range &r, char32_t &cret) noexcept { - static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF }; - if (r.empty()) { - return false; - } - std::uint32_t ch = static_cast(r.front()); - if (ch <= 0x7F) { - /* ASCII */ - cret = ch; - r.pop_front(); +namespace detail { + static inline bool u8_decode(string_range &r, char32_t &cret) noexcept { + static std::uint32_t const ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF }; + if (r.empty()) { + return false; + } + std::uint32_t ch = static_cast(r.front()); + if (ch <= 0x7F) { + /* ASCII */ + cret = ch; + r.pop_front(); + return true; + } + std::uint32_t ret = 0; + string_range sr = r; + sr.pop_front(); + /* continuation bytes */ + for (; ch & 0x40; ch <<= 1) { + /* need a continuation byte but nothing left in the string */ + if (sr.empty()) { + return false; + } + /* the continuation byte */ + std::uint32_t nch = static_cast(sr.front()); + sr.pop_front(); + /* lower 6 bits */ + std::uint32_t bch = nch & 0x3F; + /* not a continuation byte */ + if ((nch ^ bch) != 0x80) { + return false; + } + /* the 6 bits go in the result */ + ret = (ret << 6) | bch; + } + /* number of continuation bytes */ + std::size_t n = sr.data() - r.data() - 1; + /* invalid sequence - too many continuation bits */ + if (n > 3) { + return false; + } + /* add the up to 7 bits from the first byte, already shifted left by n */ + ret |= (ch & 0x7F) << (n * 5); + /* invalid sequence - out of bounds */ + if ((ret > MaxCodepoint) || (ret <= ulim[n])) { + return false; + } + cret = ret; + r = sr; return true; } - std::uint32_t ret = 0; - string_range sr = r; - sr.pop_front(); - /* continuation bytes */ - for (; ch & 0x40; ch <<= 1) { - /* need a continuation byte 
but nothing left in the string */ - if (sr.empty()) { - return false; + + std::uint8_t u8_encode( + std::uint8_t (&ret)[4], std::uint32_t ch + ) noexcept { + if (ch <= 0x7F) { + ret[0] = ch; + return 1; } - /* the continuation byte */ - std::uint32_t nch = static_cast(sr.front()); - sr.pop_front(); - /* lower 6 bits */ - std::uint32_t bch = nch & 0x3F; - /* not a continuation byte */ - if ((nch ^ bch) != 0x80) { - return false; + if (ch <= 0x7FF) { + ret[0] = 0xC0 | (ch >> 6); + ret[1] = 0x80 | (ch & 0x3F); + return 2; } - /* the 6 bits go in the result */ - ret = (ret << 6) | bch; + if (ch <= 0xFFFF) { + /* TODO: optional WTF-8 semantics + * for now simply reject surrogate code points + */ + if ((ch & 0xD800) == 0xD800) { + return 0; + } + ret[0] = 0xE0 | (ch >> 12); + ret[1] = 0x80 | ((ch >> 6) & 0x3F); + ret[2] = 0x80 | (ch & 0x3F); + return 3; + } + if (ch <= MaxCodepoint) { + ret[0] = 0xF0 | (ch >> 18); + ret[1] = 0x80 | ((ch >> 12) | 0x3F); + ret[2] = 0x80 | ((ch >> 6) | 0x3F); + ret[3] = 0x80 | (ch | 0x3F); + return 4; + } + return 0; } - /* number of continuation bytes */ - std::size_t n = sr.data() - r.data() - 1; - /* invalid sequence - too many continuation bits */ - if (n > 3) { - return false; - } - /* add the up to 7 bits from the first byte, already shifted left by n */ - ret |= (ch & 0x7F) << (n * 5); - /* invalid sequence - out of bounds */ - if ((ret > MaxCodepoint) || (ret <= ulim[n])) { - return false; - } - cret = ret; - r = sr; - return true; -} +} /* namespace detail */ bool decode(string_range &r, char32_t &ret) noexcept { - return u8_decode(r, ret); + return detail::u8_decode(r, ret); } std::size_t length(string_range r, string_range &cont) noexcept { std::size_t ret = 0; - for (char32_t ch = U'\0'; u8_decode(r, ch); ++ret) { + for (char32_t ch = U'\0'; detail::u8_decode(r, ch); ++ret) { continue; } cont = r;