expose multibyte-to-codepoint conversion
This commit is contained in:
parent
3c75d7db98
commit
d6a13d8f97
|
@ -82,15 +82,12 @@ using char_range = basic_char_range<char>;
|
||||||
using string_range = basic_char_range<char const>;
|
using string_range = basic_char_range<char const>;
|
||||||
|
|
||||||
namespace utf {
|
namespace utf {
|
||||||
/* @brief Get the number of Unicode code points in a valid UTF-8 string.
|
/* @brief Get the Unicode code point for a multibyte sequence.
|
||||||
*
|
*
|
||||||
* If an invalid UTF-8 sequence is encountered, it returns the length
|
* The string is advanced past the UTF-8 character at its front.
|
||||||
* until that sequence.
|
* If decoding fails, `false` is returned; otherwise `true` is returned.
|
||||||
*
|
|
||||||
* If you need to get the continuation string, use the error-handling
|
|
||||||
* overload of the function.
|
|
||||||
*/
|
*/
|
||||||
std::size_t length(string_range r);
|
bool codepoint(string_range &r, char32_t &ret);
|
||||||
|
|
||||||
/* @brief Get the number of Unicode code points in a string.
|
/* @brief Get the number of Unicode code points in a string.
|
||||||
*
|
*
|
||||||
|
@ -98,12 +95,22 @@ namespace utf {
|
||||||
* once it can't it returns the number of valid ones with the rest
|
* once it can't it returns the number of valid ones with the rest
|
||||||
* of the input string range being in `cont`. That means if the entire
|
* of the input string range being in `cont`. That means if the entire
|
||||||
* string is a valid UTF-8 string, `cont` will be empty, otherwise it
|
* string is a valid UTF-8 string, `cont` will be empty, otherwise it
|
||||||
* will begin at the first unvalid UTF-8 code point.
|
* will begin at the first invalid UTF-8 code point.
|
||||||
*
|
*
|
||||||
* If you're sure the string is valid or you don't need to handle the
|
* If you're sure the string is valid or you don't need to handle the
|
||||||
* error, you can use the more convenient overload above.
|
* error, you can use the more convenient overload below.
|
||||||
*/
|
*/
|
||||||
std::size_t length(string_range r, string_range &cont);
|
std::size_t length(string_range r, string_range &cont);
|
||||||
|
|
||||||
|
/* @brief Get the number of Unicode code points in a valid UTF-8 string.
|
||||||
|
*
|
||||||
|
* If an invalid UTF-8 sequence is encountered, it returns the length
|
||||||
|
* until that sequence.
|
||||||
|
*
|
||||||
|
* If you need to get the continuation string, use the general
|
||||||
|
* error-handling overload of the function.
|
||||||
|
*/
|
||||||
|
std::size_t length(string_range r);
|
||||||
} /* namespace utf */
|
} /* namespace utf */
|
||||||
|
|
||||||
/** @brief A string slice type.
|
/** @brief A string slice type.
|
||||||
|
|
|
@ -11,11 +11,7 @@ namespace utf {
|
||||||
|
|
||||||
constexpr std::uint32_t MaxCodepoint = 0x10FFFF;
|
constexpr std::uint32_t MaxCodepoint = 0x10FFFF;
|
||||||
|
|
||||||
static inline bool is_u8cont(std::uint32_t ch) {
|
static inline bool codepoint_dec(string_range &r, char32_t &cret) {
|
||||||
return (ch & 0xC0) == 0x80;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool codepoint(string_range &r, char32_t &cret) {
|
|
||||||
static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
|
static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
|
||||||
if (r.empty()) {
|
if (r.empty()) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -65,9 +61,13 @@ static inline bool codepoint(string_range &r, char32_t &cret) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool codepoint(string_range &r, char32_t &ret) {
|
||||||
|
return codepoint_dec(r, ret);
|
||||||
|
}
|
||||||
|
|
||||||
std::size_t length(string_range r, string_range &cont) {
|
std::size_t length(string_range r, string_range &cont) {
|
||||||
std::size_t ret = 0;
|
std::size_t ret = 0;
|
||||||
for (char32_t ch = U'\0'; codepoint(r, ch); ++ret) {
|
for (char32_t ch = U'\0'; codepoint_dec(r, ch); ++ret) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
cont = r;
|
cont = r;
|
||||||
|
|
Loading…
Reference in a new issue