direct encoding funcs to u8/u16/uw from any other UTF

master
Daniel Kolesa 2018-01-05 21:49:00 +01:00
parent daea42666e
commit 58ccfbe276
2 changed files with 172 additions and 48 deletions

View File

@ -1022,52 +1022,6 @@ private:
}
}
/* Copies the leading unit of `val` to `writer` unchanged and then
 * drops that unit from the range; nothing is decoded or validated.
 */
template<typename R>
void write_seq(R &writer, string_range &val) const {
    auto const &unit = val.front();
    writer.put(unit);
    val.pop_front();
}
/* Decodes one code point from `val` via utf::decode and re-emits it
 * to `writer` through utf::encode_u8.
 *
 * When decoding fails, the replacement is written and a single unit
 * is dropped from `val`. When decoding succeeds but encoding reports
 * failure (a zero/false result), only the replacement is written;
 * the range is not touched again here.
 */
template<typename R>
void write_seq(R &writer, u16string_range &val) const {
    char32_t cp;
    if (utf::decode(val, cp)) {
        /* decoded fine; fall back to the replacement if it won't encode */
        if (!utf::encode_u8(writer, cp)) {
            write_replacement(writer);
        }
        return;
    }
    /* undecodable input: emit the replacement and skip one unit */
    write_replacement(writer);
    val.pop_front();
}
/* Encodes the leading element of `val` to `writer` with
 * utf::encode_u8; if that reports failure (zero/false result) the
 * replacement is written instead. The element is dropped from the
 * range in either case.
 */
template<typename R>
void write_seq(R &writer, u32string_range &val) const {
    auto const written = utf::encode_u8(writer, val.front());
    if (!written) {
        write_replacement(writer);
    }
    val.pop_front();
}
/* Wide-character variant of write_seq.
 *
 * If wchar_t has the size of char32_t or char16_t, one code point is
 * decoded with utf::decode and re-emitted using utf::encode_u8; a
 * decode failure writes the replacement and drops one unit, an encode
 * failure writes just the replacement.
 *
 * For any other wchar_t size the leading unit is narrowed to `char`,
 * written as-is and dropped from the range.
 */
template<typename R>
void write_seq(R &writer, wstring_range &val) const {
    constexpr bool wide_matches_u16_or_u32 =
        (sizeof(wchar_t) == sizeof(char32_t)) ||
        (sizeof(wchar_t) == sizeof(char16_t));
    if constexpr(!wide_matches_u16_or_u32) {
        writer.put(char(val.front()));
        val.pop_front();
    } else {
        char32_t cp;
        if (!utf::decode(val, cp)) {
            /* undecodable input: replacement + skip one unit */
            write_replacement(writer);
            val.pop_front();
            return;
        }
        if (!utf::encode_u8(writer, cp)) {
            write_replacement(writer);
        }
    }
}
/* string base writer */
template<typename C, typename R>
void write_str(R &writer, bool escape, basic_char_range<C const> val) const {
@ -1091,8 +1045,9 @@ private:
write_char_raw(writer, c);
}
val.pop_front();
} else {
write_seq(writer, val);
} else if (!utf::encode_u8(writer, val)) {
write_replacement(writer);
val.pop_front();
}
}
writer.put('"');

View File

@ -820,6 +820,7 @@ namespace utf {
template<typename R>
inline std::size_t encode_u8(R &sink, u32string_range &r) {
/* just a wrapper; does the same thing but advances */
std::size_t n = 0;
if (!r.empty() && (n = utf::encode_u8(sink, r.front()))) {
r.pop_front();
@ -827,6 +828,57 @@ namespace utf {
return n;
}
/* Reads one code point from the front of `r` (utf::decode) and writes
 * it to `sink` with the code point overload of utf::encode_u8.
 *
 * Decoding is done on a copy of the range; `r` itself is only
 * replaced with the advanced copy once both steps succeeded.
 *
 * Returns the non-zero result of utf::encode_u8 on success, or 0
 * (with `r` left as it was) when either step fails.
 */
template<typename R>
inline std::size_t encode_u8(R &sink, u16string_range &r) {
    auto cursor = r;
    char32_t cp;
    if (!utf::decode(cursor, cp)) {
        return 0;
    }
    std::size_t const written = utf::encode_u8(sink, cp);
    if (!written) {
        return 0;
    }
    /* commit the advance only now that everything went through */
    r = cursor;
    return written;
}
/* Same-encoding case: puts the leading unit of `r` into `sink`
 * untouched and drops it from the range.
 *
 * Returns 1 when a unit was copied, 0 when `r` was empty.
 */
template<typename R>
inline std::size_t encode_u8(R &sink, string_range &r) {
    if (r.empty()) {
        return 0;
    }
    sink.put(r.front());
    r.pop_front();
    return 1;
}
/* Wide-character source.
 *
 * If wchar_t has the size of char32_t or char16_t, one code point is
 * decoded from a copy of `r` using utf::decode and written to `sink`
 * with utf::encode_u8; `r` is advanced only when both steps succeed
 * and the encoder's non-zero result is returned.
 *
 * For any other wchar_t size the leading unit is narrowed to `char`,
 * written directly and dropped from the range, returning 1.
 *
 * In every failure case (empty range, decode or encode failure) the
 * result is 0 and `r` is left unchanged.
 */
template<typename R>
inline std::size_t encode_u8(R &sink, wstring_range &r) {
    constexpr bool wide_matches_u16_or_u32 =
        (sizeof(wchar_t) == sizeof(char32_t)) ||
        (sizeof(wchar_t) == sizeof(char16_t));
    if constexpr(!wide_matches_u16_or_u32) {
        if (r.empty()) {
            return 0;
        }
        sink.put(char(r.front()));
        r.pop_front();
        return 1;
    } else {
        auto cursor = r;
        char32_t cp;
        if (!utf::decode(cursor, cp)) {
            return 0;
        }
        std::size_t const written = utf::encode_u8(sink, cp);
        if (!written) {
            return 0;
        }
        /* commit the advance only after a successful write */
        r = cursor;
        return written;
    }
}
/* @brief Encode a UTF-32 code point into UTF-16.
*
* The values are written in `sink` which is an ostd::output_range_tag.
@ -851,6 +903,7 @@ namespace utf {
template<typename R>
inline std::size_t encode_u16(R &sink, u32string_range &r) {
/* just a wrapper; does the same thing but advances */
std::size_t n = 0;
if (!r.empty() && (n = utf::encode_u16(sink, r.front()))) {
r.pop_front();
@ -858,6 +911,56 @@ namespace utf {
return n;
}
/* Same-encoding case: puts the leading unit of `r` into `sink`
 * untouched and drops it from the range.
 *
 * Returns 1 when a unit was copied, 0 when `r` was empty.
 */
template<typename R>
inline std::size_t encode_u16(R &sink, u16string_range &r) {
    if (r.empty()) {
        return 0;
    }
    sink.put(r.front());
    r.pop_front();
    return 1;
}
/* Reads one code point from the front of `r` (utf::decode) and writes
 * it to `sink` with the code point overload of utf::encode_u16.
 *
 * Decoding is done on a copy of the range; `r` itself is only
 * replaced with the advanced copy once both steps succeeded.
 *
 * Returns the non-zero result of utf::encode_u16 on success, or 0
 * (with `r` left as it was) when either step fails.
 */
template<typename R>
inline std::size_t encode_u16(R &sink, string_range &r) {
    auto cursor = r;
    char32_t cp;
    if (!utf::decode(cursor, cp)) {
        return 0;
    }
    std::size_t const written = utf::encode_u16(sink, cp);
    if (!written) {
        return 0;
    }
    /* commit the advance only now that everything went through */
    r = cursor;
    return written;
}
/* Wide-character source.
 *
 * If wchar_t has the size of char16_t (and not that of char32_t), the
 * leading unit is converted to char16_t, written directly and dropped
 * from the range, returning 1.
 *
 * Otherwise one code point is decoded from a copy of `r` using
 * utf::decode and written with utf::encode_u16; `r` is advanced only
 * when both steps succeed and the encoder's non-zero result is
 * returned.
 *
 * In every failure case (empty range, decode or encode failure) the
 * result is 0 and `r` is left unchanged.
 */
template<typename R>
inline std::size_t encode_u16(R &sink, wstring_range &r) {
    constexpr bool wide_matches_u16 =
        (sizeof(wchar_t) != sizeof(char32_t)) &&
        (sizeof(wchar_t) == sizeof(char16_t));
    if constexpr(wide_matches_u16) {
        if (r.empty()) {
            return 0;
        }
        sink.put(char16_t(r.front()));
        r.pop_front();
        return 1;
    } else {
        auto cursor = r;
        char32_t cp;
        if (!utf::decode(cursor, cp)) {
            return 0;
        }
        std::size_t const written = utf::encode_u16(sink, cp);
        if (!written) {
            return 0;
        }
        /* commit the advance only after a successful write */
        r = cursor;
        return written;
    }
}
/* @brief Encode a UTF-32 code point into a wide Unicode char/sequence.
*
* The value(s) are written in `sink` which is an ostd::output_range_tag.
@ -902,6 +1005,7 @@ namespace utf {
template<typename R>
inline std::size_t encode_uw(R &sink, u32string_range &r) {
/* just a wrapper; does the same thing but advances */
std::size_t n = 0;
if (!r.empty() && (n = utf::encode_uw(sink, r.front()))) {
r.pop_front();
@ -909,6 +1013,71 @@ namespace utf {
return n;
}
/* UTF-16 source, wide-character sink.
 *
 * If wchar_t has the size of char16_t (and not that of char32_t), the
 * leading unit is converted to wchar_t, written directly and dropped
 * from the range, returning 1.
 *
 * Otherwise one code point is decoded from a copy of `r` using
 * utf::decode and written with utf::encode_uw; `r` is advanced only
 * when both steps succeed and the encoder's non-zero result is
 * returned.
 *
 * In every failure case (empty range, decode or encode failure) the
 * result is 0 and `r` is left unchanged.
 */
template<typename R>
inline std::size_t encode_uw(R &sink, u16string_range &r) {
    constexpr bool wide_matches_u16 =
        (sizeof(wchar_t) != sizeof(char32_t)) &&
        (sizeof(wchar_t) == sizeof(char16_t));
    if constexpr(wide_matches_u16) {
        if (r.empty()) {
            return 0;
        }
        sink.put(wchar_t(r.front()));
        r.pop_front();
        return 1;
    } else {
        auto cursor = r;
        char32_t cp;
        if (!utf::decode(cursor, cp)) {
            return 0;
        }
        std::size_t const written = utf::encode_uw(sink, cp);
        if (!written) {
            return 0;
        }
        /* commit the advance only after a successful write */
        r = cursor;
        return written;
    }
}
/* Narrow (`string_range`) source, wide-character sink.
 *
 * If wchar_t has neither the size of char32_t nor that of char16_t,
 * the leading unit is converted to wchar_t, written directly and
 * dropped from the range, returning 1.
 *
 * Otherwise one code point is decoded from a copy of `r` using
 * utf::decode and written with utf::encode_uw; `r` is advanced only
 * when both steps succeed and the encoder's non-zero result is
 * returned.
 *
 * In every failure case (empty range, decode or encode failure) the
 * result is 0 and `r` is left unchanged.
 */
template<typename R>
inline std::size_t encode_uw(R &sink, string_range &r) {
    constexpr bool wide_is_neither_u16_nor_u32 =
        (sizeof(wchar_t) != sizeof(char32_t)) &&
        (sizeof(wchar_t) != sizeof(char16_t));
    if constexpr(wide_is_neither_u16_nor_u32) {
        if (r.empty()) {
            return 0;
        }
        sink.put(wchar_t(r.front()));
        r.pop_front();
        return 1;
    } else {
        auto cursor = r;
        char32_t cp;
        if (!utf::decode(cursor, cp)) {
            return 0;
        }
        std::size_t const written = utf::encode_uw(sink, cp);
        if (!written) {
            return 0;
        }
        /* commit the advance only after a successful write */
        r = cursor;
        return written;
    }
}
/* Same-encoding case: puts the leading unit of `r` into `sink` (as a
 * wchar_t) and drops it from the range.
 *
 * Returns 1 when a unit was copied, 0 when `r` was empty.
 */
template<typename R>
inline std::size_t encode_uw(R &sink, wstring_range &r) {
    if (r.empty()) {
        return 0;
    }
    sink.put(wchar_t(r.front()));
    r.pop_front();
    return 1;
}
/* @brief Get the number of Unicode code points in a string.
*
* This function keeps reading Unicode code points while it can and