Add a function to encode UTF-32 to UTF-8
parent
278b6a6269
commit
0857edfef4
|
@ -744,6 +744,34 @@ namespace utf {
|
|||
*/
|
||||
bool decode(string_range &r, char32_t &ret) noexcept;
|
||||
|
||||
namespace detail {
|
||||
/* Internal helper behind encode_u8; defined in src/string.cc.
 *
 * Encodes the code point `ch` as UTF-8 into `ret` and returns the number
 * of code units stored there (1 to 4), or 0 when `ch` is rejected.
 */
std::uint8_t u8_encode(
|
||||
std::uint8_t (&ret)[4], std::uint32_t ch
|
||||
) noexcept;
|
||||
}
|
||||
|
||||
/* @brief Encode a UTF-32 code point into UTF-8 code units.
|
||||
*
|
||||
* The units are written in `sink` which is an ostd::output_range_tag.
|
||||
* The written values are of type `char` and up to 4 are written. The
|
||||
* number of bytes written is returned from the function. In case of
|
||||
* failure, `0` is returned.
|
||||
*
|
||||
* This function is allowed to fail only in two cases, when a surrogate
|
||||
* code point is provided or when the code point is out of bounds as
|
||||
* defined by Unicode (i.e. 0x10FFFF). It does not throw exceptions
|
||||
* other than those thrown by `sink`.
|
||||
*/
|
||||
template<typename R>
|
||||
std::uint8_t encode_u8(R &sink, char32_t ch) {
|
||||
std::uint8_t buf[4];
|
||||
std::uint8_t n = detail::u8_encode(buf, ch);
|
||||
for (std::uint8_t i = 0; i < n; ++i) {
|
||||
sink.put(buf[i]);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* @brief Get the number of Unicode code points in a string.
|
||||
*
|
||||
* This function keeps reading Unicode code points while it can and
|
||||
|
|
128
src/string.cc
128
src/string.cc
|
@ -11,63 +11,99 @@ namespace utf {
|
|||
|
||||
constexpr std::uint32_t MaxCodepoint = 0x10FFFF;
|
||||
|
||||
static inline bool u8_decode(string_range &r, char32_t &cret) noexcept {
|
||||
static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
|
||||
if (r.empty()) {
|
||||
return false;
|
||||
}
|
||||
std::uint32_t ch = static_cast<unsigned char const>(r.front());
|
||||
if (ch <= 0x7F) {
|
||||
/* ASCII */
|
||||
cret = ch;
|
||||
r.pop_front();
|
||||
namespace detail {
|
||||
/* Decode one UTF-8 encoded code point from the front of `r`.
 *
 * On success the code point is stored in `cret`, the bytes that made it
 * up are removed from the front of `r` and `true` is returned. On any
 * failure (empty input, truncated or malformed sequence, overlong form,
 * value above MaxCodepoint) `false` is returned and neither `r` nor
 * `cret` is modified.
 */
static inline bool u8_decode(string_range &r, char32_t &cret) noexcept {
    /* indexed by the number of continuation bytes; a decoded value has to
     * be strictly greater than its entry, which rejects overlong forms
     * (entry 0 rejects a continuation byte appearing as the first byte)
     */
    static std::uint32_t const ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
    if (r.empty()) {
        return false;
    }
    std::uint32_t lead = static_cast<unsigned char const>(r.front());
    if (lead <= 0x7F) {
        /* plain ASCII, a single byte is the whole code point */
        cret = lead;
        r.pop_front();
        return true;
    }
    /* work on a copy so that `r` stays intact if decoding fails */
    string_range rest = r;
    rest.pop_front();
    std::uint32_t acc = 0;
    /* every set bit below the top one in the lead byte's length prefix
     * announces one continuation byte; shifting moves the next one to 0x40
     */
    while (lead & 0x40) {
        if (rest.empty()) {
            /* sequence is cut short */
            return false;
        }
        std::uint32_t const unit = static_cast<unsigned char const>(
            rest.front()
        );
        rest.pop_front();
        /* continuation bytes have the form 10xxxxxx */
        if ((unit >> 6) != 0x2) {
            return false;
        }
        /* append the 6 payload bits */
        acc = (acc << 6) | (unit & 0x3F);
        lead <<= 1;
    }
    /* how many continuation bytes were consumed */
    std::size_t const ncont = rest.data() - r.data() - 1;
    if (ncont > 3) {
        /* the lead byte asked for more than a 4 byte sequence */
        return false;
    }
    /* merge the payload bits of the lead byte, which the loop has already
     * shifted left by ncont
     */
    acc |= (lead & 0x7F) << (ncont * 5);
    if ((acc <= ulim[ncont]) || (acc > MaxCodepoint)) {
        /* overlong encoding or beyond the Unicode range */
        return false;
    }
    cret = acc;
    r = rest;
    return true;
}
|
||||
std::uint32_t ret = 0;
|
||||
string_range sr = r;
|
||||
sr.pop_front();
|
||||
/* continuation bytes */
|
||||
for (; ch & 0x40; ch <<= 1) {
|
||||
/* need a continuation byte but nothing left in the string */
|
||||
if (sr.empty()) {
|
||||
return false;
|
||||
|
||||
std::uint8_t u8_encode(
|
||||
std::uint8_t (&ret)[4], std::uint32_t ch
|
||||
) noexcept {
|
||||
if (ch <= 0x7F) {
|
||||
ret[0] = ch;
|
||||
return 1;
|
||||
}
|
||||
/* the continuation byte */
|
||||
std::uint32_t nch = static_cast<unsigned char const>(sr.front());
|
||||
sr.pop_front();
|
||||
/* lower 6 bits */
|
||||
std::uint32_t bch = nch & 0x3F;
|
||||
/* not a continuation byte */
|
||||
if ((nch ^ bch) != 0x80) {
|
||||
return false;
|
||||
if (ch <= 0x7FF) {
|
||||
ret[0] = 0xC0 | (ch >> 6);
|
||||
ret[1] = 0x80 | (ch & 0x3F);
|
||||
return 2;
|
||||
}
|
||||
/* the 6 bits go in the result */
|
||||
ret = (ret << 6) | bch;
|
||||
if (ch <= 0xFFFF) {
|
||||
/* TODO: optional WTF-8 semantics
|
||||
* for now simply reject surrogate code points
|
||||
*/
|
||||
if ((ch & 0xD800) == 0xD800) {
|
||||
return 0;
|
||||
}
|
||||
ret[0] = 0xE0 | (ch >> 12);
|
||||
ret[1] = 0x80 | ((ch >> 6) & 0x3F);
|
||||
ret[2] = 0x80 | (ch & 0x3F);
|
||||
return 3;
|
||||
}
|
||||
if (ch <= MaxCodepoint) {
|
||||
ret[0] = 0xF0 | (ch >> 18);
|
||||
ret[1] = 0x80 | ((ch >> 12) | 0x3F);
|
||||
ret[2] = 0x80 | ((ch >> 6) | 0x3F);
|
||||
ret[3] = 0x80 | (ch | 0x3F);
|
||||
return 4;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
/* number of continuation bytes */
|
||||
std::size_t n = sr.data() - r.data() - 1;
|
||||
/* invalid sequence - too many continuation bits */
|
||||
if (n > 3) {
|
||||
return false;
|
||||
}
|
||||
/* add the up to 7 bits from the first byte, already shifted left by n */
|
||||
ret |= (ch & 0x7F) << (n * 5);
|
||||
/* invalid sequence - out of bounds */
|
||||
if ((ret > MaxCodepoint) || (ret <= ulim[n])) {
|
||||
return false;
|
||||
}
|
||||
cret = ret;
|
||||
r = sr;
|
||||
return true;
|
||||
}
|
||||
} /* namespace detail */
|
||||
|
||||
/* Decode one UTF-8 encoded code point from the front of `r` into `ret`.
 *
 * Thin public wrapper: forwards to detail::u8_decode and returns its
 * result unchanged. The helper lives in the `detail` namespace, so the
 * call has to be qualified.
 */
bool decode(string_range &r, char32_t &ret) noexcept {
    return detail::u8_decode(r, ret);
}
|
||||
|
||||
std::size_t length(string_range r, string_range &cont) noexcept {
|
||||
std::size_t ret = 0;
|
||||
for (char32_t ch = U'\0'; u8_decode(r, ch); ++ret) {
|
||||
for (char32_t ch = U'\0'; detail::u8_decode(r, ch); ++ret) {
|
||||
continue;
|
||||
}
|
||||
cont = r;
|
||||
|
|
Loading…
Reference in New Issue