add some initial code for upcoming unicode support

master
Daniel Kolesa 2017-12-31 03:00:23 +01:00
parent f7929a1b45
commit 3c75d7db98
2 changed files with 120 additions and 11 deletions

View File

@ -68,6 +68,44 @@ namespace ostd {
* @{
*/
/* Forward declaration of the slice class template, so that the aliases
 * below can name it ahead of its definition.
 */
template<typename T> struct basic_char_range;
/** @brief A mutable slice over `char`. */
using char_range = basic_char_range<char>;
/** @brief An immutable slice over `char`.
 *
 * This is used in most libostd APIs that read strings. More or less
 * anything is convertible to it, including mutable slices, so it's
 * a perfect fit as long as modifications are not necessary.
 */
using string_range = basic_char_range<char const>;
namespace utf {
    /** @brief Get the number of Unicode code points in a valid UTF-8 string.
     *
     * If an invalid UTF-8 sequence is encountered, it returns the number
     * of code points found before that sequence.
     *
     * If you need to get the continuation string, use the error-handling
     * overload of the function.
     *
     * @param r The string to measure.
     * @return The number of code points read from the start of `r`.
     */
    std::size_t length(string_range r);
    /** @brief Get the number of Unicode code points in a string.
     *
     * This function keeps reading Unicode code points while it can and
     * once it can't it returns the number of valid ones with the rest
     * of the input string range being in `cont`. That means if the entire
     * string is a valid UTF-8 string, `cont` will be empty, otherwise it
     * will begin at the first invalid UTF-8 sequence.
     *
     * If you're sure the string is valid or you don't need to handle the
     * error, you can use the more convenient overload above.
     *
     * @param r The string to measure.
     * @param cont Receives the part of `r` that was not read.
     * @return The number of code points read from the start of `r`.
     */
    std::size_t length(string_range r, string_range &cont);
} /* namespace utf */
/** @brief A string slice type.
*
* This is a contiguous range over a character type. The character type
@ -344,17 +382,6 @@ private:
T *p_beg, *p_end;
};
/** @brief A mutable slice over `char`. */
using char_range = basic_char_range<char>;
/** @brief An immutable slice over `char`.
*
* This is used in most libostd APIs that read strings. More or less
* anything is convertible to it, including mutable slices, so it's
* a perfect fit as long as modifications are not necessary.
*/
using string_range = basic_char_range<char const>;
/* comparisons between ranges */
/** @brief Like `!lhs.compare(rhs)`. */

82
src/string.cc 100644
View File

@ -0,0 +1,82 @@
/* String implementation details, mainly regarding Unicode support.
*
* This file is part of libostd. See COPYING.md for further information.
*/
#include <cstdint>
#include "ostd/string.hh"
namespace ostd {
namespace utf {
/* Upper bound for decoded values: anything above U+10FFFF is rejected. */
constexpr std::uint32_t MaxCodepoint = 0x10FFFF;
/* Tell whether bits 7 and 6 of `ch` are `10`, which is the bit pattern
 * that marks a UTF-8 continuation byte. Higher bits of `ch` are ignored.
 */
static inline bool is_u8cont(std::uint32_t ch) {
    std::uint32_t const marker = (ch >> 6) & 0x3;
    return marker == 0x2;
}
/* Decode a single UTF-8 encoded code point from the front of `r`.
 *
 * On success the decoded value is written to `cret`, `r` is advanced past
 * the bytes that were consumed and `true` is returned.
 *
 * On failure `false` is returned and neither `r` nor `cret` is modified.
 * Failure means one of: empty input, a sequence cut short by the end of
 * the string, a byte that is not a continuation byte where one is needed,
 * a stray continuation byte in lead position, more than 3 continuation
 * bytes, an overlong encoding, or a value above MaxCodepoint.
 *
 * Note that surrogate code points (U+D800 to U+DFFF) are not rejected.
 */
static inline bool codepoint(string_range &r, char32_t &cret) {
    /* ulim[n] is the largest value that fits in fewer than n continuation
     * bytes, so a result at or below it is an overlong encoding; slot 0
     * is 0xFF so that a non-ASCII byte without any length bits (that is,
     * a continuation byte used as a lead byte) always fails the check
     */
    static const std::uint32_t ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
    if (r.empty()) {
        return false;
    }
    std::uint32_t ch = static_cast<unsigned char>(r.front());
    if (ch <= 0x7F) {
        /* ASCII */
        cret = ch;
        r.pop_front();
        return true;
    }
    std::uint32_t ret = 0;
    /* work on a copy so that `r` stays intact if decoding fails */
    string_range sr = r;
    sr.pop_front();
    /* number of continuation bytes consumed so far; this must not include
     * the lead byte, as it indexes `ulim` and scales the final shift
     */
    std::size_t n = 0;
    /* every set bit below the top one in the lead byte announces one
     * continuation byte; shifting left brings the next one to bit 6
     */
    for (; ch & 0x40; ch <<= 1) {
        /* need a continuation byte but nothing left in the string */
        if (sr.empty()) {
            return false;
        }
        /* the continuation byte */
        std::uint32_t nch = static_cast<unsigned char>(sr.front());
        sr.pop_front();
        /* lower 6 bits */
        std::uint32_t bch = nch & 0x3F;
        /* not a continuation byte (top two bits are not `10`) */
        if ((nch ^ bch) != 0x80) {
            return false;
        }
        /* the 6 bits go in the result */
        ret = (ret << 6) | bch;
        ++n;
    }
    /* invalid sequence - too many continuation bits */
    if (n > 3) {
        return false;
    }
    /* add the payload bits from the first byte; the loop already shifted
     * it left by n, so n * 5 more places it above the n * 6 bits gathered
     * from the continuation bytes (the mask drops the used length bits)
     */
    ret |= (ch & 0x7F) << (n * 5);
    /* invalid sequence - out of bounds or overlong */
    if ((ret > MaxCodepoint) || (ret <= ulim[n])) {
        return false;
    }
    cret = ret;
    r = sr;
    return true;
}
/* Count the code points that can be decoded from the start of `r`.
 *
 * Decoding stops at the end of the string or at the first sequence that
 * codepoint() refuses; whatever was not consumed is stored in `cont`.
 */
std::size_t length(string_range r, string_range &cont) {
    std::size_t count = 0;
    /* the decoded value itself is not needed, only the fact it decoded */
    char32_t decoded = U'\0';
    while (codepoint(r, decoded)) {
        ++count;
    }
    cont = r;
    return count;
}
/* Convenience overload: count the code points decodable from the start
 * of `r`, discarding the unread remainder.
 */
std::size_t length(string_range r) {
    /* receives the remainder, which this overload does not report */
    string_range ignored = r;
    return length(r, ignored);
}
} /* namespace utf */
} /* namespace ostd */