various traits and constants for unicode types
parent
be803bac7b
commit
24d1b5ec25
114
ostd/string.hh
114
ostd/string.hh
|
@ -629,6 +629,85 @@ namespace utf {
|
|||
* @{
|
||||
*/
|
||||
|
||||
/** @brief The fixed-width character type that corresponds to `wchar_t`.
 *
 * The choice is made purely by size: `char32_t` if `wchar_t` is as large
 * as `char32_t`, else `char16_t` if it is as large as `char16_t`, and
 * plain `char` in every remaining case.
 */
using wchar_fixed_t = std::conditional_t<
    sizeof(wchar_t) == sizeof(char32_t), char32_t,
    std::conditional_t<
        sizeof(wchar_t) == sizeof(char16_t), char16_t, char
    >
>;
|
||||
|
||||
namespace detail {
|
||||
template<typename C>
|
||||
struct max_units_base;
|
||||
|
||||
template<>
|
||||
struct max_units_base<char32_t> {
|
||||
static constexpr std::size_t const value = 1;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct max_units_base<char16_t> {
|
||||
static constexpr std::size_t const value = 2;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct max_units_base<char> {
|
||||
static constexpr std::size_t const value = 4;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct max_units_base<wchar_t>: max_units_base<wchar_fixed_t> {};
|
||||
} /* namespace detail */
|
||||
|
||||
template<typename C>
|
||||
static inline constexpr std::size_t const max_units =
|
||||
detail::max_units_base<C>::value;
|
||||
|
||||
/** @brief True when `wchar_t` maps to `char32_t` (see `wchar_fixed_t`). */
inline constexpr bool is_wchar_u32 = std::is_same_v<wchar_fixed_t, char32_t>;

/** @brief True when `wchar_t` maps to `char16_t` (see `wchar_fixed_t`). */
inline constexpr bool is_wchar_u16 = std::is_same_v<wchar_fixed_t, char16_t>;

/** @brief True when `wchar_t` maps to plain `char` (see `wchar_fixed_t`). */
inline constexpr bool is_wchar_u8 = std::is_same_v<wchar_fixed_t, char>;
|
||||
|
||||
namespace detail {
    /* Any type without an explicit specialization below is not treated
     * as a character type; cv-qualified variants are not stripped here.
     */
    template<typename C>
    struct is_character_base: std::false_type {};

    template<>
    struct is_character_base<char>: std::true_type {};

    template<>
    struct is_character_base<char16_t>: std::true_type {};

    template<>
    struct is_character_base<char32_t>: std::true_type {};

    template<>
    struct is_character_base<wchar_t>: std::true_type {};
} /* namespace detail */
|
||||
|
||||
template<typename C>
|
||||
static inline constexpr bool const is_character =
|
||||
detail::is_character_base<C>::value;
|
||||
|
||||
/** @brief The largest code point value accepted as valid (U+10FFFF). */
inline constexpr char32_t max_unicode = 0x10FFFF;
|
||||
|
||||
/** @brief Thrown on UTF-8 decoding failure. */
|
||||
struct utf_error: std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
|
@ -739,10 +818,7 @@ namespace utf {
|
|||
* actually decodes; in both cases it encodes to utf-8,
|
||||
* for utf-8 the whole thing is just an advancing wrapper
|
||||
*/
|
||||
if constexpr(
|
||||
(sizeof(wchar_t) == sizeof(char32_t)) ||
|
||||
(sizeof(wchar_t) == sizeof(char16_t))
|
||||
) {
|
||||
if constexpr(is_wchar_u32 || is_wchar_u16) {
|
||||
auto rr = r;
|
||||
if (char32_t ch; utf::decode(rr, ch)) {
|
||||
if (std::size_t ret; (ret = utf::encode_u8(sink, ch))) {
|
||||
|
@ -821,10 +897,7 @@ namespace utf {
|
|||
/* when wchar_t is guaranteed utf-16, we have an identity
|
||||
* match so we just advance; otherwise decode and encode
|
||||
*/
|
||||
if constexpr(
|
||||
(sizeof(wchar_t) != sizeof(char32_t)) &&
|
||||
(sizeof(wchar_t) == sizeof(char16_t))
|
||||
) {
|
||||
if constexpr(is_wchar_u16) {
|
||||
if (!r.empty()) {
|
||||
sink.put(char16_t(r.front()));
|
||||
r.pop_front();
|
||||
|
@ -874,10 +947,10 @@ namespace utf {
|
|||
template<typename R>
|
||||
inline std::size_t encode_uw(R &sink, char32_t ch) {
|
||||
std::size_t n;
|
||||
if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
|
||||
if constexpr(is_wchar_u32) {
|
||||
n = 1;
|
||||
sink.put(wchar_t(ch));
|
||||
} else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
|
||||
} else if constexpr(is_wchar_u16) {
|
||||
char16_t buf[2];
|
||||
n = detail::u16_encode(buf, ch);
|
||||
for (std::size_t i = 0; i < n; ++i) {
|
||||
|
@ -909,10 +982,7 @@ namespace utf {
|
|||
* match much like encode_u16 with wstring, otherwise
|
||||
* decode and encode
|
||||
*/
|
||||
if constexpr(
|
||||
(sizeof(wchar_t) != sizeof(char32_t)) &&
|
||||
(sizeof(wchar_t) == sizeof(char16_t))
|
||||
) {
|
||||
if constexpr(is_wchar_u16) {
|
||||
if (!r.empty()) {
|
||||
sink.put(wchar_t(r.front()));
|
||||
r.pop_front();
|
||||
|
@ -936,10 +1006,7 @@ namespace utf {
|
|||
* match so there is no reencoding, otherwise decode and
|
||||
* encode...
|
||||
*/
|
||||
if constexpr(
|
||||
(sizeof(wchar_t) != sizeof(char32_t)) &&
|
||||
(sizeof(wchar_t) != sizeof(char16_t))
|
||||
) {
|
||||
if constexpr(is_wchar_u8) {
|
||||
if (!r.empty()) {
|
||||
sink.put(wchar_t(r.front()));
|
||||
r.pop_front();
|
||||
|
@ -972,12 +1039,7 @@ namespace utf {
|
|||
inline std::size_t encode(
|
||||
[[maybe_unused]] OR &sink, [[maybe_unused]] IR &r
|
||||
) {
|
||||
static_assert(
|
||||
std::is_same_v<C, char32_t> ||
|
||||
std::is_same_v<C, char16_t> ||
|
||||
std::is_same_v<C, char> ||
|
||||
std::is_same_v<C, wchar_t>, "Invalid input type"
|
||||
);
|
||||
static_assert(is_character<C>, "Invalid input type");
|
||||
if constexpr(std::is_same_v<C, char32_t>) {
|
||||
return encode_u32(sink, r);
|
||||
} else if constexpr(std::is_same_v<C, char16_t>) {
|
||||
|
@ -1059,7 +1121,7 @@ namespace utf {
|
|||
|
||||
private:
|
||||
void advance() {
|
||||
auto r = basic_char_range<OC>(p_buf, p_buf + sizeof(p_buf));
|
||||
auto r = basic_char_range<OC>(p_buf, p_buf + max_units<OC>);
|
||||
if (std::size_t n; !(n = utf::encode<OC>(r, p_range))) {
|
||||
/* range is unchanged */
|
||||
p_left = basic_char_range<OC>{};
|
||||
|
@ -1071,7 +1133,7 @@ namespace utf {
|
|||
|
||||
basic_char_range<IC const> p_range;
|
||||
basic_char_range<OC> p_left{};
|
||||
OC p_buf[4];
|
||||
OC p_buf[max_units<OC>];
|
||||
};
|
||||
} /* namespace detail */
|
||||
|
||||
|
|
|
@ -14,11 +14,9 @@ namespace utf {
|
|||
/* place the vtable in here */
|
||||
utf_error::~utf_error() {}
|
||||
|
||||
/* upper limit of the code point range; larger values are rejected */
constexpr char32_t MaxCodepoint{0x10FFFF};
|
||||
|
||||
namespace detail {
|
||||
inline bool is_invalid_u32(char32_t c) {
|
||||
return (((c >= 0xD800) && (c <= 0xDFFF)) || (c > MaxCodepoint));
|
||||
return (((c >= 0xD800) && (c <= 0xDFFF)) || (c > utf::max_unicode));
|
||||
}
|
||||
|
||||
static inline std::size_t u8_decode(
|
||||
|
@ -118,7 +116,7 @@ namespace detail {
|
|||
ret[2] = char(0x80 | (ch & 0x3F));
|
||||
return 3;
|
||||
}
|
||||
if (ch <= MaxCodepoint) {
|
||||
if (ch <= utf::max_unicode) {
|
||||
ret[0] = char(0xF0 | (ch >> 18));
|
||||
ret[1] = char(0x80 | ((ch >> 12) | 0x3F));
|
||||
ret[2] = char(0x80 | ((ch >> 6) | 0x3F));
|
||||
|
@ -132,7 +130,7 @@ namespace detail {
|
|||
char16_t (&ret)[2], char32_t ch
|
||||
) noexcept {
|
||||
/* surrogate code point or out of bounds */
|
||||
if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > MaxCodepoint)) {
|
||||
if (((ch >= 0xD800) && (ch <= 0xDFFF)) || (ch > utf::max_unicode)) {
|
||||
return 0;
|
||||
}
|
||||
if (ch <= 0xFFFF) {
|
||||
|
@ -161,10 +159,7 @@ namespace detail {
|
|||
template<typename C>
|
||||
inline std::size_t length(basic_char_range<C const> &r) noexcept {
|
||||
std::size_t ret = 0;
|
||||
if constexpr(std::is_same_v<C, char32_t> || (
|
||||
std::is_same_v<C, wchar_t> &&
|
||||
(sizeof(wchar_t) == sizeof(char32_t))
|
||||
)) {
|
||||
if constexpr(utf::max_units<C> == 1) {
|
||||
ret = r.size();
|
||||
} else {
|
||||
for (;; ++ret) {
|
||||
|
@ -215,7 +210,7 @@ bool decode(u32string_range &r, char32_t &ret) noexcept {
|
|||
|
||||
bool decode(wstring_range &r, char32_t &ret) noexcept {
|
||||
std::size_t n, tn = r.size();
|
||||
if constexpr(sizeof(wchar_t) == sizeof(char32_t)) {
|
||||
if constexpr(is_wchar_u32) {
|
||||
if (!tn) {
|
||||
return false;
|
||||
}
|
||||
|
@ -226,7 +221,7 @@ bool decode(wstring_range &r, char32_t &ret) noexcept {
|
|||
ret = c;
|
||||
r.pop_front();
|
||||
return true;
|
||||
} else if constexpr(sizeof(wchar_t) == sizeof(char16_t)) {
|
||||
} else if constexpr(is_wchar_u16) {
|
||||
auto *beg = reinterpret_cast<char16_t const *>(r.data());
|
||||
n = detail::u16_decode(beg, beg + tn, ret);
|
||||
} else {
|
||||
|
@ -319,7 +314,7 @@ bool isvalid(char32_t c) noexcept {
|
|||
return false;
|
||||
}
|
||||
/* must be within range */
|
||||
return (c <= MaxCodepoint);
|
||||
return (c <= utf::max_unicode);
|
||||
}
|
||||
|
||||
bool isxdigit(char32_t c) noexcept {
|
||||
|
@ -567,15 +562,7 @@ int case_compare(u32string_range s1, u32string_range s2) noexcept {
|
|||
}
|
||||
|
||||
int case_compare(wstring_range s1, wstring_range s2) noexcept {
|
||||
using C = std::conditional_t<
|
||||
sizeof(wchar_t) == sizeof(char32_t),
|
||||
char32_t,
|
||||
std::conditional_t<
|
||||
sizeof(wchar_t) == sizeof(char16_t),
|
||||
char16_t,
|
||||
unsigned char
|
||||
>
|
||||
>;
|
||||
using C = std::conditional_t<is_wchar_u8, unsigned char, wchar_fixed_t>;
|
||||
auto *beg1 = reinterpret_cast<C const *>(s1.data());
|
||||
auto *beg2 = reinterpret_cast<C const *>(s2.data());
|
||||
return detail::case_compare(beg1, beg1 + s1.size(), beg2, beg2 + s2.size());
|
||||
|
|
Loading…
Reference in New Issue