add utf-16 decoding/encoding support

master
Daniel Kolesa 2018-01-05 02:18:36 +01:00
parent 723c06c612
commit d74736d8f4
2 changed files with 81 additions and 2 deletions

View File

@ -742,13 +742,20 @@ namespace utf {
using std::runtime_error::runtime_error;
};
/* @brief Get the Unicode code point for a UTF-8 sequence.
 *
 * The string is advanced past the Unicode character in the front.
 * If the decoding fails, `false` is returned, otherwise it's `true`.
 *
 * @param[in,out] r The UTF-8 range to decode from.
 * @param[out] ret Receives the decoded code point.
 */
bool decode(string_range &r, char32_t &ret) noexcept;
/* @brief Get the Unicode code point for a UTF-16 sequence.
 *
 * On success the string is advanced past the Unicode character in the
 * front, i.e. by one code unit, or by two when the front is a surrogate
 * pair. If the decoding fails, `false` is returned and neither `r` nor
 * `ret` is modified; otherwise it's `true`.
 *
 * @param[in,out] r The UTF-16 range to decode from.
 * @param[out] ret Receives the decoded code point.
 */
bool decode(u16string_range &r, char32_t &ret) noexcept;
/* @brief Get the Unicode code point from a UTF-32 string.
*
* The string is advanced by one. This can only fail if the string
@ -767,6 +774,9 @@ namespace utf {
/* Implementation helper behind encode_u8(): encodes `ch` as UTF-8 into
 * `ret`. encode_u8() treats the return value as the number of leading
 * elements of `ret` that hold code units; presumably `0` signals a
 * failed encoding (the definition is not fully visible here, confirm).
 */
std::uint8_t u8_encode(
std::uint8_t (&ret)[4], std::uint32_t ch
) noexcept;
/* Implementation helper behind encode_u16(): encodes `ch` as UTF-16
 * into `ret` and returns the number of code units stored. That is 1 for
 * code points up to 0xFFFF, 2 (a surrogate pair) for anything above, or
 * 0 when `ch` is a surrogate code point or exceeds `MaxCodepoint`, in
 * which case `ret` is not written to.
 */
std::uint8_t u16_encode(
std::uint16_t (&ret)[2], std::uint32_t ch
) noexcept;
}
/* @brief Encode a UTF-32 code point into UTF-8 code units.
@ -782,7 +792,7 @@ namespace utf {
* other than those thrown by `sink`.
*/
template<typename R>
std::uint8_t encode_u8(R &sink, char32_t ch) {
inline std::uint8_t encode_u8(R &sink, char32_t ch) {
std::uint8_t buf[4];
std::uint8_t n = detail::u8_encode(buf, ch);
for (std::uint8_t i = 0; i < n; ++i) {
@ -791,6 +801,28 @@ namespace utf {
return n;
}
/* @brief Encode a UTF-32 code point into UTF-16 code units.
 *
 * The code units are written into `sink`, an output range, through its
 * `put` method. Every written value is of type `char16_t` and up to 2
 * of them are written. The number of values written is returned from
 * the function. In case of failure, `0` is returned and nothing is
 * written into `sink`.
 *
 * This function is allowed to fail only in two cases, when a surrogate
 * code point is provided or when the code point is out of bounds as
 * defined by Unicode (i.e. 0x10FFFF). It does not throw exceptions
 * other than those thrown by `sink`.
 *
 * @param sink The output range receiving the code units.
 * @param ch The code point to encode.
 * @return The number of code units written, `0` on failure.
 */
template<typename R>
inline std::uint8_t encode_u16(R &sink, char32_t ch) {
    std::uint16_t buf[2];
    std::uint8_t n = detail::u16_encode(buf, ch);
    for (std::uint8_t i = 0; i < n; ++i) {
        /* convert explicitly: a sink with an overloaded or templated
         * `put` must see a UTF-16 character as documented, not a plain
         * 16-bit integer
         */
        sink.put(static_cast<char16_t>(buf[i]));
    }
    return n;
}
/* @brief Get the number of Unicode code points in a string.
*
* This function keeps reading Unicode code points while it can and

View File

@ -67,6 +67,31 @@ namespace detail {
return true;
}
/* Decodes a single code point from the front of the UTF-16 range `r`.
 *
 * On success the code point is stored in `cret`, `r` is advanced past
 * the consumed code units (one, or two for a surrogate pair) and `true`
 * is returned. On failure `false` is returned and neither `r` nor
 * `cret` is modified. Decoding fails when `r` is empty, when a lead
 * surrogate is not immediately followed by a trail surrogate, and when
 * the front is a trail surrogate with no lead surrogate before it.
 */
static inline bool u16_decode(u16string_range &r, char32_t &cret) noexcept {
    if (r.empty()) {
        return false;
    }
    std::uint32_t ch = r.front();
    /* anything outside the surrogate block stands for itself */
    if ((ch < 0xD800) || (ch > 0xDFFF)) {
        r.pop_front();
        cret = ch;
        return true;
    }
    /* a trail surrogate in the front has no lead to pair with; that is
     * ill-formed UTF-16 and must not be handed out as a code point
     */
    if (ch >= 0xDC00) {
        return false;
    }
    /* lead surrogate; unpaired when nothing follows it */
    if (r.size() < 2) {
        return false;
    }
    std::uint32_t nch = r[1];
    /* whatever follows a lead surrogate has to be a trail surrogate */
    if ((nch < 0xDC00) || (nch > 0xDFFF)) {
        return false;
    }
    r = r.slice(2, r.size());
    /* 10 bits from each half, offset past the basic multilingual plane */
    cret = 0x10000 + (((ch - 0xD800) << 10) | (nch - 0xDC00));
    return true;
}
std::uint8_t u8_encode(
std::uint8_t (&ret)[4], std::uint32_t ch
) noexcept {
@ -100,12 +125,34 @@ namespace detail {
}
return 0;
}
/* Encodes the code point `ch` as UTF-16 into `ret`.
 *
 * Returns how many elements of `ret` were filled in: 1 for code points
 * that fit a single code unit, 2 for those that need a surrogate pair
 * and 0 (leaving `ret` alone) when `ch` cannot be encoded at all.
 */
std::uint8_t u16_encode(
    std::uint16_t (&ret)[2], std::uint32_t ch
) noexcept {
    /* surrogate code points are reserved for the encoding itself */
    bool const is_surrogate = (ch >= 0xD800) && (ch <= 0xDFFF);
    if (is_surrogate || (ch > MaxCodepoint)) {
        return 0;
    }
    if (ch > 0xFFFF) {
        /* what is left after dropping the plane offset is a 20-bit
         * number, each surrogate carries one 10-bit half of it
         */
        std::uint32_t const offset = ch - 0x10000;
        ret[0] = static_cast<std::uint16_t>(0xD800 + (offset >> 10));
        ret[1] = static_cast<std::uint16_t>(0xDC00 + (offset & 0x3FF));
        return 2;
    }
    /* fits a single code unit as is */
    ret[0] = static_cast<std::uint16_t>(ch);
    return 1;
}
} /* namespace detail */
/* UTF-8 overload of the public decoder; the work is done by
 * detail::u8_decode() and its result is passed through unchanged.
 */
bool decode(string_range &r, char32_t &ret) noexcept {
    bool const decoded = detail::u8_decode(r, ret);
    return decoded;
}
/* UTF-16 overload of the public decoder; the work is done by
 * detail::u16_decode() and its result is passed through unchanged.
 */
bool decode(u16string_range &r, char32_t &ret) noexcept {
    bool const decoded = detail::u16_decode(r, ret);
    return decoded;
}
std::size_t length(string_range r, string_range &cont) noexcept {
std::size_t ret = 0;
for (char32_t ch = U'\0'; detail::u8_decode(r, ch); ++ret) {