add utf-16 decoding/encoding support
parent
723c06c612
commit
d74736d8f4
|
@ -742,13 +742,20 @@ namespace utf {
|
|||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
/* @brief Get the Unicode code point for a multibyte sequence.
|
||||
/* @brief Get the Unicode code point for a UTF-8 sequence.
|
||||
*
|
||||
* The string is advanced past the Unicode character in the front.
|
||||
* If the decoding fails, `false` is returned, otherwise it's `true`.
|
||||
*/
|
||||
bool decode(string_range &r, char32_t &ret) noexcept;
|
||||
|
||||
/* @brief Get the Unicode code point for a UTF-16 sequence.
|
||||
*
|
||||
* The string is advanced past the Unicode character in the front.
|
||||
* If the decoding fails, `false` is returned, otherwise it's `true`.
|
||||
*/
|
||||
bool decode(u16string_range &r, char32_t &ret) noexcept;
|
||||
|
||||
/* @brief Get the Unicode code point from a UTF-32 string.
|
||||
*
|
||||
* The string is advanced by one. This can only fail if the string
|
||||
|
@ -767,6 +774,9 @@ namespace utf {
|
|||
std::uint8_t u8_encode(
|
||||
std::uint8_t (&ret)[4], std::uint32_t ch
|
||||
) noexcept;
|
||||
std::uint8_t u16_encode(
|
||||
std::uint16_t (&ret)[2], std::uint32_t ch
|
||||
) noexcept;
|
||||
}
|
||||
|
||||
/* @brief Encode a UTF-32 code point into UTF-8 code units.
|
||||
|
@ -782,7 +792,7 @@ namespace utf {
|
|||
* other than those thrown by `sink`.
|
||||
*/
|
||||
template<typename R>
|
||||
std::uint8_t encode_u8(R &sink, char32_t ch) {
|
||||
inline std::uint8_t encode_u8(R &sink, char32_t ch) {
|
||||
std::uint8_t buf[4];
|
||||
std::uint8_t n = detail::u8_encode(buf, ch);
|
||||
for (std::uint8_t i = 0; i < n; ++i) {
|
||||
|
@ -791,6 +801,28 @@ namespace utf {
|
|||
return n;
|
||||
}
|
||||
|
||||
/* @brief Encode a UTF-32 code point into UTF-16.
|
||||
*
|
||||
* The values are written in `sink` which is an ostd::output_range_tag.
|
||||
* The written values are of type `char16_t` and up to 2 are written.
|
||||
* The number of values written is returned from the function. In case
|
||||
* of failure, `0` is returned.
|
||||
*
|
||||
* This function is allowed to fail only in two cases, when a surrogate
|
||||
* code point is provided or when the code point is out of bounds as
|
||||
* defined by Unicode (i.e. 0x10FFFF). It does not throw exceptions
|
||||
* other than those thrown by `sink`.
|
||||
*/
|
||||
template<typename R>
|
||||
inline std::uint8_t encode_u16(R &sink, char32_t ch) {
|
||||
std::uint16_t buf[2];
|
||||
std::uint8_t n = detail::u16_encode(buf, ch);
|
||||
for (std::uint8_t i = 0; i < n; ++i) {
|
||||
sink.put(buf[i]);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* @brief Get the number of Unicode code points in a string.
|
||||
*
|
||||
* This function keeps reading Unicode code points while it can and
|
||||
|
|
|
@ -67,6 +67,31 @@ namespace detail {
|
|||
return true;
|
||||
}
|
||||
|
||||
/* @brief Decode one code point from the front of a UTF-16 range.
 *
 * On success the decoded code point is stored in `cret`, the range `r`
 * is advanced past the one or two code units that were consumed, and
 * `true` is returned. On failure `false` is returned and neither `r`
 * nor `cret` is modified.
 *
 * Decoding fails when:
 *   - the range is empty,
 *   - the front is a lead surrogate that is not immediately followed
 *     by a trail surrogate (including a lead at the very end),
 *   - the front is a trail surrogate, which is never valid on its own.
 *
 * Rejecting a lone trail surrogate keeps this in agreement with
 * `u16_encode`, which refuses to encode any surrogate code point.
 *
 * @param r The UTF-16 range to read from; advanced only on success.
 * @param cret Receives the decoded code point on success.
 *
 * @return `true` if a code point was decoded, `false` otherwise.
 */
static inline bool u16_decode(u16string_range &r, char32_t &cret) noexcept {
    if (r.empty()) {
        return false;
    }
    std::uint32_t ch = r.front();
    /* not a surrogate: the single code unit is the code point itself */
    if ((ch < 0xD800) || (ch > 0xDFFF)) {
        r.pop_front();
        cret = ch;
        return true;
    }
    /* ch is a surrogate here; a trail surrogate (>= 0xDC00) cannot
     * start a sequence, and a lead surrogate needs a second code unit
     */
    if ((ch >= 0xDC00) || (r.size() < 2)) {
        return false;
    }
    std::uint32_t nch = r[1];
    /* the lead surrogate must be paired with a trail surrogate */
    if ((nch < 0xDC00) || (nch > 0xDFFF)) {
        return false;
    }
    r = r.slice(2, r.size());
    /* each surrogate carries 10 bits of the offset from 0x10000 */
    cret = 0x10000 + (((ch - 0xD800) << 10) | (nch - 0xDC00));
    return true;
}
|
||||
|
||||
std::uint8_t u8_encode(
|
||||
std::uint8_t (&ret)[4], std::uint32_t ch
|
||||
) noexcept {
|
||||
|
@ -100,12 +125,34 @@ namespace detail {
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::uint8_t u16_encode(
|
||||
std::uint16_t (&ret)[2], std::uint32_t ch
|
||||
) noexcept {
|
||||
/* surrogate code point or out of bounds */
|
||||
if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > MaxCodepoint)) {
|
||||
return 0;
|
||||
}
|
||||
if (ch <= 0xFFFF) {
|
||||
ret[0] = std::uint16_t(ch);
|
||||
return 1;
|
||||
}
|
||||
/* 20-bit number */
|
||||
ch -= 0x10000;
|
||||
ret[0] = std::uint16_t(0xD800 + (ch >> 10));
|
||||
ret[1] = std::uint16_t(0xDC00 + (ch & 0x3FF));
|
||||
return 2;
|
||||
}
|
||||
} /* namespace detail */
|
||||
|
||||
/* Public entry point for the `string_range` overload: forwards to
 * `detail::u8_decode` and returns its result unchanged.
 */
bool decode(string_range &r, char32_t &ret) noexcept {
    bool const decoded = detail::u8_decode(r, ret);
    return decoded;
}
|
||||
|
||||
/* Public entry point for the `u16string_range` overload: forwards to
 * `detail::u16_decode` and returns its result unchanged.
 */
bool decode(u16string_range &r, char32_t &ret) noexcept {
    bool const decoded = detail::u16_decode(r, ret);
    return decoded;
}
|
||||
|
||||
std::size_t length(string_range r, string_range &cont) noexcept {
|
||||
std::size_t ret = 0;
|
||||
for (char32_t ch = U'\0'; detail::u8_decode(r, ch); ++ret) {
|
||||
|
|
Loading…
Reference in New Issue