expose multibyte-to-codepoint conversion

master
Daniel Kolesa 2017-12-31 03:26:15 +01:00
parent 3c75d7db98
commit d6a13d8f97
2 changed files with 22 additions and 15 deletions

View File

@ -82,15 +82,12 @@ using char_range = basic_char_range<char>;
using string_range = basic_char_range<char const>; using string_range = basic_char_range<char const>;
namespace utf { namespace utf {
/* @brief Get the number of Unicode code points in a valid UTF-8 string. /* @brief Get the Unicode code point for a multibyte sequence.
* *
* If an invalid UTF-8 sequence is encountered, it returns the length * The string is advanced past the UTF-8 character in the front.
* until that sequence. * If the decoding fails, `false` is returned, otherwise it's `true`.
*
* If you need to get the continuation string, use the error-handling
* overload of the function.
*/ */
std::size_t length(string_range r); bool codepoint(string_range &r, char32_t &ret);
/* @brief Get the number of Unicode code points in a string. /* @brief Get the number of Unicode code points in a string.
* *
@ -98,12 +95,22 @@ namespace utf {
* once it can't it returns the number of valid ones with the rest * once it can't it returns the number of valid ones with the rest
* of the input string range being in `cont`. That means if the entire * of the input string range being in `cont`. That means if the entire
* string is a valid UTF-8 string, `cont` will be empty, otherwise it * string is a valid UTF-8 string, `cont` will be empty, otherwise it
* will begin at the first unvalid UTF-8 code point. * will begin at the first invalid UTF-8 code point.
* *
* If you're sure the string is valid or you don't need to handle the * If you're sure the string is valid or you don't need to handle the
* error, you can use the more convenient overload above. * error, you can use the more convenient overload below.
*/ */
std::size_t length(string_range r, string_range &cont); std::size_t length(string_range r, string_range &cont);
/* @brief Get the number of Unicode code points in a valid UTF-8 string.
*
* If an invalid UTF-8 sequence is encountered, it returns the length
* until that sequence.
*
* If you need to get the continuation string, use the general
* error-handling overload of the function.
*/
std::size_t length(string_range r);
} /* namespace utf */ } /* namespace utf */
/** @brief A string slice type. /** @brief A string slice type.

View File

@ -11,11 +11,7 @@ namespace utf {
constexpr std::uint32_t MaxCodepoint = 0x10FFFF; constexpr std::uint32_t MaxCodepoint = 0x10FFFF;
static inline bool is_u8cont(std::uint32_t ch) { static inline bool codepoint_dec(string_range &r, char32_t &cret) {
return (ch & 0xC0) == 0x80;
}
static inline bool codepoint(string_range &r, char32_t &cret) {
static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF }; static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
if (r.empty()) { if (r.empty()) {
return false; return false;
@ -65,9 +61,13 @@ static inline bool codepoint(string_range &r, char32_t &cret) {
return true; return true;
} }
/* Public UTF-8 decoding entry point: hands both arguments to the
 * file-local `codepoint_dec` helper and reports its result unchanged.
 *
 * `r` and `ret` are taken by reference, so anything the helper writes
 * to them is visible to the caller.
 */
bool codepoint(string_range &r, char32_t &ret) {
    bool const decoded = codepoint_dec(r, ret);
    return decoded;
}
std::size_t length(string_range r, string_range &cont) { std::size_t length(string_range r, string_range &cont) {
std::size_t ret = 0; std::size_t ret = 0;
for (char32_t ch = U'\0'; codepoint(r, ch); ++ret) { for (char32_t ch = U'\0'; codepoint_dec(r, ch); ++ret) {
continue; continue;
} }
cont = r; cont = r;