various traits and constants for unicode types

master
Daniel Kolesa 2018-01-07 17:13:53 +01:00
parent be803bac7b
commit 24d1b5ec25
2 changed files with 96 additions and 47 deletions

View File

@ -629,6 +629,85 @@ namespace utf {
* @{
*/
/** @brief The fixed-width character type matching the size of `wchar_t`.
 *
 * Resolves to `char32_t` when `wchar_t` has the size of `char32_t`,
 * otherwise to `char16_t` when it has the size of `char16_t`, and
 * falls back to `char` when neither size matches.
 */
using wchar_fixed_t = std::conditional_t<
    sizeof(wchar_t) != sizeof(char32_t),
    std::conditional_t<
        sizeof(wchar_t) != sizeof(char16_t), char, char16_t
    >,
    char32_t
>;
namespace detail {
    /* Per-type storage for the largest number of code units of type C
     * that one code point may take; the primary template is declared
     * but never defined, so any other type is a compile error.
     */
    template<typename C>
    struct max_units_base;

    /* char32_t: a single unit */
    template<>
    struct max_units_base<char32_t>:
        std::integral_constant<std::size_t, 1>
    {};

    /* char16_t: up to two units */
    template<>
    struct max_units_base<char16_t>:
        std::integral_constant<std::size_t, 2>
    {};

    /* char: up to four units */
    template<>
    struct max_units_base<char>:
        std::integral_constant<std::size_t, 4>
    {};

    /* wchar_t: whatever its same-sized fixed-width counterpart uses */
    template<>
    struct max_units_base<wchar_t>: max_units_base<wchar_fixed_t> {};
} /* namespace detail */
/** @brief The maximum number of code units of type `C` per code point.
 *
 * The value is taken from `detail::max_units_base<C>`, which is only
 * defined for `char`, `char16_t`, `char32_t` and `wchar_t`; any other
 * type fails to compile.
 */
template<typename C>
static inline constexpr std::size_t max_units{
    detail::max_units_base<C>::value
};
/** @brief True when `wchar_fixed_t` is `char32_t`. */
static inline constexpr bool is_wchar_u32{
    std::is_same_v<wchar_fixed_t, char32_t>
};

/** @brief True when `wchar_fixed_t` is `char16_t`. */
static inline constexpr bool is_wchar_u16{
    std::is_same_v<wchar_fixed_t, char16_t>
};

/** @brief True when `wchar_fixed_t` is `char`. */
static inline constexpr bool is_wchar_u8{
    std::is_same_v<wchar_fixed_t, char>
};
namespace detail {
    /* Primary template: a type is not a character type unless one of
     * the specializations below says otherwise.
     */
    template<typename C>
    struct is_character_base: std::false_type {};

    /* the four recognized character types; the match is exact, so
     * cv-qualified or signed/unsigned variants are not covered
     */
    template<>
    struct is_character_base<char>: std::true_type {};

    template<>
    struct is_character_base<char16_t>: std::true_type {};

    template<>
    struct is_character_base<char32_t>: std::true_type {};

    template<>
    struct is_character_base<wchar_t>: std::true_type {};
} /* namespace detail */
/** @brief Whether `C` is one of the recognized character types.
 *
 * The value is taken from `detail::is_character_base<C>`, which is true
 * for exactly `char`, `char16_t`, `char32_t` and `wchar_t`.
 */
template<typename C>
static inline constexpr bool is_character{
    detail::is_character_base<C>::value
};
/** @brief The upper bound for code point values (0x10FFFF). */
static inline constexpr char32_t max_unicode{0x10FFFF};
/** @brief Thrown on UTF-8 decoding failure. */
struct utf_error: std::runtime_error {
using std::runtime_error::runtime_error;
@ -739,10 +818,7 @@ namespace utf {
* actually decodes; in both cases it encodes to utf-8,
* for utf-8 the whole thing is just an advancing wrapper
*/
if constexpr(
(sizeof(wchar_t) == sizeof(char32_t)) ||
(sizeof(wchar_t) == sizeof(char16_t))
) {
if constexpr(is_wchar_u32 || is_wchar_u16) {
auto rr = r;
if (char32_t ch; utf::decode(rr, ch)) {
if (std::size_t ret; (ret = utf::encode_u8(sink, ch))) {
@ -821,10 +897,7 @@ namespace utf {
/* when wchar_t is guaranteed utf-16, we have an identity
* match so we just advance; otherwise decode and encode
*/
if constexpr(
(sizeof(wchar_t) != sizeof(char32_t)) &&
(sizeof(wchar_t) == sizeof(char16_t))
) {
if constexpr(is_wchar_u16) {
if (!r.empty()) {
sink.put(char16_t(r.front()));
r.pop_front();
@ -874,10 +947,10 @@ namespace utf {
template<typename R>
inline std::size_t encode_uw(R &sink, char32_t ch) {
std::size_t n;
if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
if constexpr(is_wchar_u32) {
n = 1;
sink.put(wchar_t(ch));
} else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
} else if constexpr(is_wchar_u16) {
char16_t buf[2];
n = detail::u16_encode(buf, ch);
for (std::size_t i = 0; i < n; ++i) {
@ -909,10 +982,7 @@ namespace utf {
* match much like encode_u16 with wstring, otherwise
* decode and encode
*/
if constexpr(
(sizeof(wchar_t) != sizeof(char32_t)) &&
(sizeof(wchar_t) == sizeof(char16_t))
) {
if constexpr(is_wchar_u16) {
if (!r.empty()) {
sink.put(wchar_t(r.front()));
r.pop_front();
@ -936,10 +1006,7 @@ namespace utf {
* match so there is no reencoding, otherwise decode and
* encode...
*/
if constexpr(
(sizeof(wchar_t) != sizeof(char32_t)) &&
(sizeof(wchar_t) != sizeof(char16_t))
) {
if constexpr(is_wchar_u8) {
if (!r.empty()) {
sink.put(wchar_t(r.front()));
r.pop_front();
@ -972,12 +1039,7 @@ namespace utf {
inline std::size_t encode(
[[maybe_unused]] OR &sink, [[maybe_unused]] IR &r
) {
static_assert(
std::is_same_v<C, char32_t> ||
std::is_same_v<C, char16_t> ||
std::is_same_v<C, char> ||
std::is_same_v<C, wchar_t>, "Invalid input type"
);
static_assert(is_character<C>, "Invalid input type");
if constexpr(std::is_same_v<C, char32_t>) {
return encode_u32(sink, r);
} else if constexpr(std::is_same_v<C, char16_t>) {
@ -1059,7 +1121,7 @@ namespace utf {
private:
void advance() {
auto r = basic_char_range<OC>(p_buf, p_buf + sizeof(p_buf));
auto r = basic_char_range<OC>(p_buf, p_buf + max_units<OC>);
if (std::size_t n; !(n = utf::encode<OC>(r, p_range))) {
/* range is unchanged */
p_left = basic_char_range<OC>{};
@ -1071,7 +1133,7 @@ namespace utf {
basic_char_range<IC const> p_range;
basic_char_range<OC> p_left{};
OC p_buf[4];
OC p_buf[max_units<OC>];
};
} /* namespace detail */

View File

@ -14,11 +14,9 @@ namespace utf {
/* place the vtable in here */
utf_error::~utf_error() {}
constexpr char32_t MaxCodepoint = 0x10FFFF;
namespace detail {
/* Reports whether `c` is unusable as a code point: true when it lies
 * in the range 0xD800-0xDFFF (surrogates) or is greater than the
 * maximum code point constant, false otherwise.
 */
inline bool is_invalid_u32(char32_t c) {
/* NOTE(review): the two returns below differ only in the name of the
 * upper-bound constant and look like the before/after versions of a
 * single line; as written only the first one can execute - confirm
 * which one is meant to stay
 */
return (((c >= 0xD800) && (c <= 0xDFFF)) || (c > MaxCodepoint));
return (((c >= 0xD800) && (c <= 0xDFFF)) || (c > utf::max_unicode));
}
static inline std::size_t u8_decode(
@ -118,7 +116,7 @@ namespace detail {
ret[2] = char(0x80 | (ch & 0x3F));
return 3;
}
if (ch <= MaxCodepoint) {
if (ch <= utf::max_unicode) {
/* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx; each
 * continuation byte carries exactly six payload bits, so the shifted
 * value is masked with 0x3F (or-ing it in would force every
 * continuation byte to 0xBF and lose the payload)
 */
ret[0] = char(0xF0 | (ch >> 18));
ret[1] = char(0x80 | ((ch >> 12) & 0x3F));
ret[2] = char(0x80 | ((ch >> 6) & 0x3F));
@ -132,7 +130,7 @@ namespace detail {
char16_t (&ret)[2], char32_t ch
) noexcept {
/* surrogate code point or out of bounds */
if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > MaxCodepoint)) {
if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > utf::max_unicode)) {
return 0;
}
if (ch <= 0xFFFF) {
@ -161,10 +159,7 @@ namespace detail {
template<typename C>
inline std::size_t length(basic_char_range<C const> &r) noexcept {
std::size_t ret = 0;
if constexpr(std::is_same_v<C, char32_t> || (
std::is_same_v<C, wchar_t> &&
(sizeof(wchar_t) == sizeof(char32_t))
)) {
if constexpr(utf::max_units<C> == 1) {
ret = r.size();
} else {
for (;; ++ret) {
@ -215,7 +210,7 @@ bool decode(u32string_range &r, char32_t &ret) noexcept {
bool decode(wstring_range &r, char32_t &ret) noexcept {
std::size_t n, tn = r.size();
if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
if constexpr(is_wchar_u32) {
if (!tn) {
return false;
}
@ -226,7 +221,7 @@ bool decode(wstring_range &r, char32_t &ret) noexcept {
ret = c;
r.pop_front();
return true;
} else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
} else if constexpr(is_wchar_u16) {
auto *beg = reinterpret_cast<char16_t const *>(r.data());
n = detail::u16_decode(beg, beg + tn, ret);
} else {
@ -319,7 +314,7 @@ bool isvalid(char32_t c) noexcept {
return false;
}
/* must be within range */
return (c <= MaxCodepoint);
return (c <= utf::max_unicode);
}
bool isxdigit(char32_t c) noexcept {
@ -567,15 +562,7 @@ int case_compare(u32string_range s1, u32string_range s2) noexcept {
}
int case_compare(wstring_range s1, wstring_range s2) noexcept {
using C = std::conditional_t<
sizeof(wchar_t) == sizeof(char32_t),
char32_t,
std::conditional_t<
sizeof(wchar_t) == sizeof(char16_t),
char16_t,
unsigned char
>
>;
using C = std::conditional_t<is_wchar_u8, unsigned char, wchar_fixed_t>;
auto *beg1 = reinterpret_cast<C const *>(s1.data());
auto *beg2 = reinterpret_cast<C const *>(s2.data());
return detail::case_compare(beg1, beg1 + s1.size(), beg2, beg2 + s2.size());