add some initial code for upcoming unicode support
parent
f7929a1b45
commit
3c75d7db98
|
@ -68,6 +68,44 @@ namespace ostd {
|
|||
* @{
|
||||
*/
|
||||
|
||||
/* Forward declaration, so that the aliases below can name the slice
 * template ahead of its full definition.
 */
template<typename T> struct basic_char_range;
|
||||
|
||||
/** @brief A mutable slice over `char`. */
|
||||
using char_range = basic_char_range<char>;
|
||||
|
||||
/** @brief An immutable slice over `char`.
|
||||
*
|
||||
* This is used in most libostd APIs that read strings. More or less
|
||||
* anything is convertible to it, including mutable slices, so it's
|
||||
* a perfect fit as long as modifications are not necessary.
|
||||
*/
|
||||
using string_range = basic_char_range<char const>;
|
||||
|
||||
namespace utf {
|
||||
/** @brief Get the number of Unicode code points in a valid UTF-8 string.
|
||||
*
|
||||
* If an invalid UTF-8 sequence is encountered, it returns the length
|
||||
* until that sequence.
|
||||
*
|
||||
* If you need to get the continuation string, use the error-handling
|
||||
* overload of the function.
|
||||
*/
|
||||
std::size_t length(string_range r);
|
||||
|
||||
/** @brief Get the number of Unicode code points in a string.
|
||||
*
|
||||
* This function keeps reading Unicode code points while it can and
|
||||
* once it can't it returns the number of valid ones with the rest
|
||||
* of the input string range being in `cont`. That means if the entire
|
||||
* string is a valid UTF-8 string, `cont` will be empty, otherwise it
|
||||
* will begin at the first invalid UTF-8 sequence.
|
||||
*
|
||||
* If you're sure the string is valid or you don't need to handle the
|
||||
* error, you can use the more convenient overload above.
|
||||
*/
|
||||
std::size_t length(string_range r, string_range &cont);
|
||||
} /* namespace utf */
|
||||
|
||||
/** @brief A string slice type.
|
||||
*
|
||||
* This is a contiguous range over a character type. The character type
|
||||
|
@ -344,17 +382,6 @@ private:
|
|||
T *p_beg, *p_end;
|
||||
};
|
||||
|
||||
/** @brief A mutable slice over `char`. */
|
||||
using char_range = basic_char_range<char>;
|
||||
|
||||
/** @brief An immutable slice over `char`.
|
||||
*
|
||||
* This is used in most libostd APIs that read strings. More or less
|
||||
* anything is convertible to it, including mutable slices, so it's
|
||||
* a perfect fit as long as modifications are not necessary.
|
||||
*/
|
||||
using string_range = basic_char_range<char const>;
|
||||
|
||||
/* comparisons between ranges */
|
||||
|
||||
/** @brief Like `!lhs.compare(rhs)`. */
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
/* String implementation details, mainly regarding Unicode support.
|
||||
*
|
||||
* This file is part of libostd. See COPYING.md for further information.
|
||||
*/
|
||||
|
||||
#include <cstdint>
|
||||
#include "ostd/string.hh"
|
||||
|
||||
namespace ostd {
|
||||
namespace utf {
|
||||
|
||||
/* Upper bound for decoded values: the decoder in this file rejects
 * anything greater than this (U+10FFFF).
 */
constexpr std::uint32_t MaxCodepoint = 0x10FFFF;
|
||||
|
||||
/* Check whether bits 7 and 6 of `ch` are `1` and `0` respectively, i.e.
 * whether its low byte has the `10xxxxxx` form of a UTF-8 continuation
 * byte. Bits above the low byte are not inspected.
 */
static inline bool is_u8cont(std::uint32_t ch) {
    constexpr std::uint32_t top_two_bits = 0xC0;
    constexpr std::uint32_t cont_marker = 0x80;
    std::uint32_t const marker = ch & top_two_bits;
    return marker == cont_marker;
}
|
||||
|
||||
/* Decode a single UTF-8 encoded code point from the front of `r`.
 *
 * On success, the decoded value is written to `cret`, `r` is advanced
 * past the bytes that were consumed and `true` is returned.
 *
 * On failure `false` is returned and neither `r` nor `cret` is touched.
 * Failure means one of: empty input, a truncated sequence, a byte that
 * is not a continuation byte where one is required, a stray continuation
 * byte in lead position, more than 3 continuation bytes, an overlong
 * encoding, a surrogate (U+D800..U+DFFF) or a value above `MaxCodepoint`.
 */
static inline bool codepoint(string_range &r, char32_t &cret) {
    /* ulim[n] is the largest value that is too small to legitimately need
     * n continuation bytes; a decoded value at or below it is overlong
     * (index 0 catches a continuation byte used as the lead byte)
     */
    static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
    if (r.empty()) {
        return false;
    }
    std::uint32_t ch = static_cast<unsigned char const>(r.front());
    if (ch <= 0x7F) {
        /* ASCII */
        cret = ch;
        r.pop_front();
        return true;
    }
    std::uint32_t ret = 0;
    /* number of continuation bytes read; counted explicitly so that the
     * lead byte is never included (the pointer difference between the
     * working range and `r` would be off by one because of it)
     */
    std::size_t n = 0;
    /* work on a copy so that `r` stays intact if decoding fails */
    string_range sr = r;
    sr.pop_front();
    /* every set bit below the top one in the lead byte announces
     * one continuation byte; shift the lead byte to test them in turn
     */
    for (; ch & 0x40; ch <<= 1, ++n) {
        /* need a continuation byte but nothing left in the string */
        if (sr.empty()) {
            return false;
        }
        /* the continuation byte */
        std::uint32_t nch = static_cast<unsigned char const>(sr.front());
        sr.pop_front();
        /* lower 6 bits */
        std::uint32_t bch = nch & 0x3F;
        /* not a continuation byte (top two bits are not `10`) */
        if ((nch ^ bch) != 0x80) {
            return false;
        }
        /* the 6 bits go in the result */
        ret = (ret << 6) | bch;
    }
    /* invalid sequence - too many continuation bits */
    if (n > 3) {
        return false;
    }
    /* add the payload bits of the first byte; it was already shifted left
     * n times by the loop, so n * 5 more gives the required n * 6 in total
     */
    ret |= (ch & 0x7F) << (n * 5);
    /* invalid sequence - out of bounds or overlong */
    if ((ret > MaxCodepoint) || (ret <= ulim[n])) {
        return false;
    }
    /* invalid sequence - UTF-16 surrogates must not be encoded in UTF-8 */
    if ((ret & 0xFFFFF800) == 0xD800) {
        return false;
    }
    cret = ret;
    r = sr;
    return true;
}
|
||||
|
||||
/* Count the code points at the front of `r` by decoding them one at a
 * time until decoding fails (which includes reaching the end of input).
 * Whatever was not consumed is stored in `cont`.
 */
std::size_t length(string_range r, string_range &cont) {
    std::size_t count = 0;
    /* the decoded value itself is not needed, only whether decoding worked */
    char32_t decoded = U'\0';
    while (codepoint(r, decoded)) {
        ++count;
    }
    cont = r;
    return count;
}
|
||||
|
||||
/* Convenience overload: same count as the two-argument version, with
 * the unconsumed remainder discarded.
 */
std::size_t length(string_range r) {
    /* scratch range that receives the remainder nobody asked for */
    string_range rest = r;
    std::size_t const count = length(r, rest);
    return count;
}
|
||||
|
||||
} /* namespace utf */
|
||||
} /* namespace ostd */
|
Loading…
Reference in New Issue