/** @defgroup Strings * * @brief Provides string processing extensions. * * As libostd provides a range system, it represents string slices as * contiguous ranges of characters. This has many advantages, such as * being able to use them with generic algorithms. The string slices are * not zero terminated, which means creating subslices is very fast, it's * basically just pointer arithmetic. * * Integration with existing string handling facilities is ensured, so you * can incorporate libostd into any existing project and still benefit from * the new features. * * A simple example: * * ~~~{.cc} * #include * #include * * int main() { * ostd::string_range x = "hello world"; * auto p1 = x.slice(0, 5); * auto p2 = x.slice(6); * ostd::writeln(p1); // hello * ostd::writeln(p2); // world * } * ~~~ * * An example of using libostd string formatting: * * @include format.cc * * See the examples provided with the library for further information. * * @{ */ /** @file string.hh * * @brief String slice implementation as well as other utilities. * * This file implements string slices, their comparisons, utilities, * standard C++ string range integration, range literals, std::hash * support for string slices and others. * * @copyright See COPYING.md in the project tree for further information. */ #ifndef OSTD_STRING_HH #define OSTD_STRING_HH #include #include #include #include #include #include #include #include #include #include #include #include #include namespace ostd { static_assert( (sizeof(wchar_t) == sizeof(char)) || (sizeof(wchar_t) == sizeof(char16_t)) || (sizeof(wchar_t) == sizeof(char32_t)), "wchar_t must correspond to either char, char16_t or char32_t" ); namespace detail { OSTD_EXPORT std::size_t tstrlen(char const *p) noexcept; OSTD_EXPORT std::size_t tstrlen(char16_t const *p) noexcept; OSTD_EXPORT std::size_t tstrlen(char32_t const *p) noexcept; OSTD_EXPORT std::size_t tstrlen(wchar_t const *p) noexcept; } /** @addtogroup Strings * @{ */ /** @brief A string slice type. 
* * This is a contiguous range over a character type. The character type * can be any of the standard character types, of any size - for example * you would use `char32_t` to represent UTF-32 slices. * * The range is mutable, i.e. it implements the output range interface. */ template struct basic_char_range: input_range> { using range_category = contiguous_range_tag; using value_type = T; using reference = T &; using size_type = std::size_t; private: struct nat {}; public: /** @brief Constructs an empty slice. */ basic_char_range() noexcept: p_beg(nullptr), p_end(nullptr) {} /** @brief Constructs a slice from two pointers. * * The first pointer is the beginning of the slice * and the second pointer is just past the end. */ basic_char_range(value_type *beg, value_type *end) noexcept: p_beg(beg), p_end(end) {} /** @brief Constructs an empty slice. */ basic_char_range(std::nullptr_t) noexcept: p_beg(nullptr), p_end(nullptr) {} /** @brief Slices are arbitrarily copy constructible. */ basic_char_range(basic_char_range const &v) noexcept: p_beg(v.p_beg), p_end(v.p_end) {} /** @brief Slices can be constructed from string views. */ template basic_char_range(std::basic_string_view< std::remove_const_t, U > const &v) noexcept: p_beg{v.data()}, p_end{v.data() + v.size()} {} /** @brief Constructs a slice from a pointer or a static array. * * This constructor handles two cases. The input must be convertible * to `T *`, if it's not, this constructor is not enabled. Effectively, * if the input is a static array of `T`, the entire array is used to * create the slice, minus the potential zero at the end. If there is * no zero at the end, nothing is removed and the array is used whole. * If the input is not an array, the size is checked at runtime. 
*/ template basic_char_range(U &&beg, std::enable_if_t< std::is_convertible_v, nat > = nat{}) noexcept: p_beg(beg) { if constexpr(std::is_array_v>) { std::size_t N = std::extent_v>;/* drop the last unit only when it is a zero terminator */ p_end = beg + N - (beg[N - 1] == '\0'); } else {/* a null pointer is not measured and yields an empty slice */ p_end = beg + (beg ? detail::tstrlen(beg) : 0); } } /** @brief Constructs a slice from an std::basic_string. * * This uses the string's data to construct a matching slice. The slice * spans `s.data()` to `s.data() + s.size()`; nothing is copied. */ template basic_char_range( std::basic_string, STR, A> const &s ) noexcept: p_beg(s.data()), p_end(s.data() + s.size()) {} /** @brief Constructs a slice from a different but compatible slice. * * A pointer to the other slice's value type must be convertible to * a pointer to the new slice's value type, otherwise the constructor * will not be enabled. * * NOTE(review): the bounds are taken as `&v[0]` and `&v[v.size()]`, * so the source slice is indexed even when it is empty. */ template >> basic_char_range(basic_char_range const &v) noexcept: p_beg(&v[0]), p_end(&v[v.size()]) {} /** @brief Slices are arbitrarily copy assignable. * * Only the two boundary pointers are copied. */ basic_char_range &operator=(basic_char_range const &v) noexcept { p_beg = v.p_beg; p_end = v.p_end; return *this; } /** @brief Assigns the slice's data from a matching std::basic_string. * * The slice then spans `s.data()` to `s.data() + s.size()`. */ template basic_char_range &operator=( std::basic_string const &s ) noexcept { p_beg = s.data(); p_end = s.data() + s.size(); return *this; } /** @brief Assigns the slice's data from a pointer. * * The data pointed to by the argument must be zero terminated. * A null pointer results in an empty slice. */ basic_char_range &operator=(value_type *s) noexcept { p_beg = s; p_end = s + (s ? detail::tstrlen(s) : 0); return *this; } /** @brief Checks if the slice is empty, i.e. both bounds are equal. */ bool empty() const noexcept { return p_beg == p_end; } /** @brief Pops the first character out of the slice. * * This is bounds checked, std::out_of_range is thrown when * slice was already empty before popping out the character. * No changes are done to the slice if it throws. * * @throws std::out_of_range when empty. 
* * @see front(), pop_back() */ void pop_front() { if (p_beg == p_end) { throw std::out_of_range{"pop_front on empty range"}; } ++p_beg; } /** @brief Gets a reference to the first character. * * The behavior is undefined when the slice is empty. * * @see back(), pop_front() */ reference front() const noexcept { return *p_beg; } /** @brief Pops the last character out of the slice. * * This is bounds checked, std::out_of_range is thrown when the * slice was already empty before popping out the character. * No changes are done to the slice if it throws. * * @throws std::out_of_range when empty. * * @see back(), pop_front() */ void pop_back() { if (p_beg == p_end) { throw std::out_of_range{"pop_back on empty range"}; } --p_end; } /** @brief Gets a reference to the last character. * * The behavior is undefined when the slice is empty. * * @see front(), pop_back() */ reference back() const noexcept { return *(p_end - 1); } /** @brief Gets the number of `value_type` elements (code units) in the slice. */ size_type size() const noexcept { return size_type(p_end - p_beg); } /** @brief Gets the number of code points in the slice. * * Effectively the same as utf::length(). */ inline size_type length() const noexcept; /** @brief Gets the number of code points in the slice, with a continuation. * * Effectively the same as the utf::length() overloads that take `cont`. * NOTE(review): the definition is not in this part of the file; `cont` * presumably receives the unread remainder - confirm. */ inline size_type length(basic_char_range &cont) const noexcept; /** @brief Creates a sub-slice of the slice. * * Behavior is undefined if `start` and `end` are not within the * slice's bounds. There is no bound checking done in this call. * It's also undefined if the first argument is larger than the * second argument. */ basic_char_range slice(size_type start, size_type end) const noexcept { return basic_char_range(p_beg + start, p_beg + end); } /** @brief Creates a sub-slice of the slice until the end. * * Equivalent to slice(size_type, size_type) with `size()` as * the second argument. The first argument must be within the * slice's boundaries, otherwise the behavior is undefined. 
*/ basic_char_range slice(size_type start) const noexcept { return slice(start, size()); } /** @brief Gets a reference to a character within the slice. * * The behavior is undefined if the index is not within the bounds. */ reference operator[](size_type i) const noexcept { return p_beg[i]; } /** @brief Writes a character to the front of the slice and advances past it. * * The slice shrinks by one from the front, like after pop_front(). * * @throws std::out_of_range when empty. */ void put(value_type v) { if (p_beg == p_end) { throw std::out_of_range{"put into an empty range"}; } *(p_beg++) = v; } /** @brief Gets the pointer to the beginning. */ value_type *data() noexcept { return p_beg; } /** @brief Gets the pointer to the beginning (const overload). */ value_type const *data() const noexcept { return p_beg; } /** @brief Compares two slices. * * This works similarly to the C function `strcmp` or the `compare` * method of std::char_traits, but does not depend on the strings * to be terminated. * * It performs an ordinary lexicographical comparison, the values * are compared and the first string to have a lesser value is * considered lexicographically less. If they are equal up to a * point but one of them terminates early, it's also less. * * If the `this` slice is the lesser one, a negative value is * returned. If they are equal (if they're both zero length, * it counts as equal) then `0` is returned. Otherwise, a * positive value is returned. * * This works with the slice's native unit values, i.e. bytes * for UTF-8, `char16_t` for UTF-16 and `char32_t` for UTF-32. * These units are compared by getting the difference between * them (i.e. `this[index] - other[index]`), with both units * converted to `int` first. When a unit differs, that difference * is the return value; when only the sizes differ, the result * is `-1` or `1`. * * It is not a part of the range interface, just the string slice * interface. * * @see case_compare() */ int compare(basic_char_range s) const noexcept { size_type s1 = size(), s2 = s.size(); for (size_type i = 0, ms = std::min(s1, s2); i < ms; ++i) { int d = int(p_beg[i]) - int(s[i]); if (d) { return d; } } return (s1 < s2) ? -1 : ((s1 > s2) ? 
1 : 0); } /** @brief Compares two slices in a case insensitive manner. * * Works exactly the same as compare(), but in a case insensitive * way, i.e. it lowercases the characters and compares them after * that. * * For UTF-8, it decodes the string on the fly, then lowercases the * decoded code points and uses their difference (without encoding * them back). If the decoding fails, the failing code unit is used * as-is, so this function never fails. Identical treatment is given * to UTF-16. */ inline int case_compare(basic_char_range s) const noexcept; /** @brief Iterate over the Unicode units of the given type. * * Like utf::iter_u(). */ template inline auto iter_u() const; /** @brief Iterate over the Unicode units of the size in bits. * * The type maps to `char` for 8, `char16_t` for 16 and `char32_t` * for 32, or UTF-8, UTF-16 and UTF-32. * * Like utf::iter_u(). */ template inline auto iter_u() const; /** @brief Implicitly converts a string slice to std::basic_string_view. * * String views represent more or less the same thing but they're always * immutable. This simple conversion allows usage of string slices on * any API that uses either strings or string view, as well as construct * strings and string views out of slices. */ operator std::basic_string_view>() const noexcept { return std::basic_string_view>{data(), size()}; } private: T *p_beg, *p_end; }; /** @brief A mutable slice over `char`. */ using char_range = basic_char_range; /** @brief A mutable slice over `wchar_t`. */ using wchar_range = basic_char_range; /** @brief A mutable slice over `char16_t`. */ using char16_range = basic_char_range; /** @brief A mutable slice over `char32_t`. */ using char32_range = basic_char_range; /** @brief An immutable slice over `char`. * * This is used in most libostd APIs that read strings. More or less * anything is convertible to it, including mutable slices, so it's * a perfect fit as long as modifications are not necessary. 
*/ using string_range = basic_char_range; /** @brief An immutable slice over `wchar_t`. * * Included primarily for compatibility with other APIs. */ using wstring_range = basic_char_range; /** @brief An immutable slice over `char16_t`. * * Included for basic UTF-16 compatibility. */ using u16string_range = basic_char_range; /** @brief An immutable slice over `char32_t`. * * Can represent UTF-32 strings. */ using u32string_range = basic_char_range; /* comparisons between utf-8 ranges */ /** @brief Like `!lhs.compare(rhs)`. */ inline bool operator==(string_range lhs, string_range rhs) noexcept { return !lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs)`. */ inline bool operator!=(string_range lhs, string_range rhs) noexcept { return lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs) < 0`. */ inline bool operator<(string_range lhs, string_range rhs) noexcept { return lhs.compare(rhs) < 0; } /** @brief Like `lhs.compare(rhs) > 0`. */ inline bool operator>(string_range lhs, string_range rhs) noexcept { return lhs.compare(rhs) > 0; } /** @brief Like `lhs.compare(rhs) <= 0`. */ inline bool operator<=(string_range lhs, string_range rhs) noexcept { return lhs.compare(rhs) <= 0; } /** @brief Like `lhs.compare(rhs) >= 0`. */ inline bool operator>=(string_range lhs, string_range rhs) noexcept { return lhs.compare(rhs) >= 0; } /* comparisons between utf-16 ranges */ /** @brief Like `!lhs.compare(rhs)`. */ inline bool operator==(u16string_range lhs, u16string_range rhs) noexcept { return !lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs)`. */ inline bool operator!=(u16string_range lhs, u16string_range rhs) noexcept { return lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs) < 0`. */ inline bool operator<(u16string_range lhs, u16string_range rhs) noexcept { return lhs.compare(rhs) < 0; } /** @brief Like `lhs.compare(rhs) > 0`. 
*/ inline bool operator>(u16string_range lhs, u16string_range rhs) noexcept { return lhs.compare(rhs) > 0; } /** @brief Like `lhs.compare(rhs) <= 0`. */ inline bool operator<=(u16string_range lhs, u16string_range rhs) noexcept { return lhs.compare(rhs) <= 0; } /** @brief Like `lhs.compare(rhs) >= 0`. */ inline bool operator>=(u16string_range lhs, u16string_range rhs) noexcept { return lhs.compare(rhs) >= 0; } /* comparisons between utf-32 ranges */ /** @brief Like `!lhs.compare(rhs)`. */ inline bool operator==(u32string_range lhs, u32string_range rhs) noexcept { return !lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs)`. */ inline bool operator!=(u32string_range lhs, u32string_range rhs) noexcept { return lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs) < 0`. */ inline bool operator<(u32string_range lhs, u32string_range rhs) noexcept { return lhs.compare(rhs) < 0; } /** @brief Like `lhs.compare(rhs) > 0`. */ inline bool operator>(u32string_range lhs, u32string_range rhs) noexcept { return lhs.compare(rhs) > 0; } /** @brief Like `lhs.compare(rhs) <= 0`. */ inline bool operator<=(u32string_range lhs, u32string_range rhs) noexcept { return lhs.compare(rhs) <= 0; } /** @brief Like `lhs.compare(rhs) >= 0`. */ inline bool operator>=(u32string_range lhs, u32string_range rhs) noexcept { return lhs.compare(rhs) >= 0; } /* comparisons between wide ranges */ /** @brief Like `!lhs.compare(rhs)`. */ inline bool operator==(wstring_range lhs, wstring_range rhs) noexcept { return !lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs)`. */ inline bool operator!=(wstring_range lhs, wstring_range rhs) noexcept { return lhs.compare(rhs); } /** @brief Like `lhs.compare(rhs) < 0`. */ inline bool operator<(wstring_range lhs, wstring_range rhs) noexcept { return lhs.compare(rhs) < 0; } /** @brief Like `lhs.compare(rhs) > 0`. */ inline bool operator>(wstring_range lhs, wstring_range rhs) noexcept { return lhs.compare(rhs) > 0; } /** @brief Like `lhs.compare(rhs) <= 0`. 
*/ inline bool operator<=(wstring_range lhs, wstring_range rhs) noexcept { return lhs.compare(rhs) <= 0; } /** @brief Like `lhs.compare(rhs) >= 0`. */ inline bool operator>=(wstring_range lhs, wstring_range rhs) noexcept { return lhs.compare(rhs) >= 0; } /** @brief Checks if a string slice starts with another slice. */ inline bool starts_with(string_range a, string_range b) noexcept { if (a.size() < b.size()) { return false; } return a.slice(0, b.size()) == b; } /** @brief Checks if a string slice starts with another slice. */ inline bool starts_with(u16string_range a, u16string_range b) noexcept { if (a.size() < b.size()) { return false; } return a.slice(0, b.size()) == b; } /** @brief Checks if a string slice starts with another slice. */ inline bool starts_with(u32string_range a, u32string_range b) noexcept { if (a.size() < b.size()) { return false; } return a.slice(0, b.size()) == b; } /** @brief Checks if a string slice starts with another slice. */ inline bool starts_with(wstring_range a, wstring_range b) noexcept { if (a.size() < b.size()) { return false; } return a.slice(0, b.size()) == b; } /** @brief Mutable range integration for std::basic_string. * * The range type used for mutable string references * is an ostd::basic_char_range with mutable values. */ template struct ranged_traits> { /** @brief The range type. */ using range = basic_char_range; /** @brief Creates a range. */ static range iter(std::basic_string &v) noexcept { return range{v.data(), v.data() + v.size()}; } }; /** @brief Immutable range integration for std::basic_string. * * The range type used for immutable string references * is an ostd::basic_char_range with immutable values. */ template struct ranged_traits const> { /** @brief The range type. */ using range = basic_char_range; /** @brief Creates a range. 
*/ static range iter(std::basic_string const &v) noexcept { return range{v.data(), v.data() + v.size()}; } }; /* more UTF utilities beyond basic API */ namespace utf { /** @addtogroup Strings * @{ */ /** @brief A Unicode type of the same size as `wchar_t`. * * This can be an alias to either `char32_t`, `char16_t` or `char` * representing UTF-32, UTF-16 or UTF-8 respectively. It represents * a Unicode character type that in a platform specific way represents * one of the 3 encodings. It follows the same order so if `wchar_t` * can fit UTF-32, it's UTF-32, otherwise it tries UTF-16 and UTF-8 * as the next best thing. * * On most platforms and all platforms that we care about, this does * the right thing; UTF-32 on most Unix-like systems (with 32-bit * `wchar_t`), UTF-16 on Windows (with 16-bit `wchar_t`). */ using wchar_fixed_t = std::conditional_t< sizeof(wchar_t) == sizeof(char32_t), char32_t, std::conditional_t< sizeof(wchar_t) == sizeof(char16_t), char16_t, char > >; namespace detail { template struct max_units_base; template<> struct max_units_base { static constexpr std::size_t const value = 1; static constexpr std::size_t const bits = 32; }; template<> struct max_units_base { static constexpr std::size_t const value = 2; static constexpr std::size_t const bits = 16; }; template<> struct max_units_base { static constexpr std::size_t const value = 4; static constexpr std::size_t const bits = 8; }; template<> struct max_units_base: max_units_base {}; } /* namespace detail */ /** @brief The maximum number of code units to represent a code point. * * The allowed input types are `char`, `char16_t`, `char32_t`, `wchar_t`. * The result is 4, 2, 1 and platform-dependent (usually 1 or 2). It * makes use of utf::wchar_fixed_t for `wchar_t`. */ template static inline constexpr std::size_t const max_units = detail::max_units_base::value; /** @brief The number of bits for a code unit type. 
* * Keep in mind that this does not represent the *actual* number of bits * in the type, just the smallest number of bits for an integer type that * can hold all code units of the encoding. So for `char` it's 8, for * `char16_t` it's 16, for `char32_t` it's 32, for `wchar_t` it's * platform defined. */ template static inline constexpr std::size_t const unit_bits = detail::max_units_base::bits; namespace detail { template struct unicode_t_base; template<> struct unicode_t_base<32> { using type = char32_t; }; template<> struct unicode_t_base<16> { using type = char16_t; }; template<> struct unicode_t_base<8> { using type = char; }; } /** @brief The UTF unit type according to utf::unit_bits. * * For 8 this will be `char`, for 16 `char16_t`, for 32 `char32_t`. */ template using unicode_t = typename detail::unicode_t_base::type; /** @brief A normalized Unicode type for the given character type. * * This will map to itself for all character types but `wchar_t`, * which maps to utf::wchar_fixed_t. */ template using unicode_base_t = unicode_t>; /** @brief A constant to tell if `wchar_t` is UTF-32. * * No actual checks are performed, utf::wchar_fixed_t is used to tell. */ static inline constexpr bool const is_wchar_u32 = std::is_same_v; /** @brief A constant to tell if `wchar_t` is UTF-16. * * No actual checks are performed, utf::wchar_fixed_t is used to tell. */ static inline constexpr bool const is_wchar_u16 = std::is_same_v; /** @brief A constant to tell if `wchar_t` is UTF-8. * * No actual checks are performed, utf::wchar_fixed_t is used to tell. 
*/ static inline constexpr bool const is_wchar_u8 = std::is_same_v; namespace detail { template struct is_character_base { static constexpr bool const value = false; }; template<> struct is_character_base { static constexpr bool const value = true; }; template<> struct is_character_base { static constexpr bool const value = true; }; template<> struct is_character_base { static constexpr bool const value = true; }; template<> struct is_character_base { static constexpr bool const value = true; }; } /** @brief Check whether the input type is a character type. * * For `char`, `char16_t`, `char32_t` and `wchar_t` this is true, * otherwise it's false. */ template static inline constexpr bool const is_character = detail::is_character_base::value; /** @brief The maximum value a Unicode code point can have. */ static inline constexpr char32_t const max_unicode = 0x10FFFF; /** @brief Thrown when a Unicode conversion fails. * * Within this file it is thrown by the ranges created by utf::iter_u(). */ struct OSTD_EXPORT utf_error: std::runtime_error { using std::runtime_error::runtime_error; /* empty, for vtable placement */ virtual ~utf_error(); }; /** @brief Get the Unicode code point for a UTF-8 sequence. * * The string is advanced past the Unicode character in the front. * If the decoding fails, `false` is returned, otherwise it's `true`. */ OSTD_EXPORT bool decode(string_range &r, char32_t &ret) noexcept; /** @brief Get the Unicode code point for a UTF-16 sequence. * * The string is advanced past the Unicode character in the front. * If the decoding fails, `false` is returned, otherwise it's `true`. */ OSTD_EXPORT bool decode(u16string_range &r, char32_t &ret) noexcept; /** @brief Get the Unicode code point from a UTF-32 string. * * The string is advanced by one. It can also fail if utf::isvalid() * returns `false` for the front character, in which case the string * will not be advanced. */ OSTD_EXPORT bool decode(u32string_range &r, char32_t &ret) noexcept; /** @brief Get the Unicode code point for a wide Unicode char/sequence. 
* * The input is treated as either UTF-8, UTF-16 or UTF-32 depending * on the size of the wide character. Typically, it will be UTF-16 * on Windows and UTF-32 on Unix-like systems, with UTF-32 taking * priority (on systems where two or more of the types are the same * size). */ OSTD_EXPORT bool decode(wstring_range &r, char32_t &ret) noexcept; namespace detail { OSTD_EXPORT std::size_t encode( char (&ret)[4], char32_t ch ) noexcept; OSTD_EXPORT std::size_t encode( char16_t (&ret)[2], char32_t ch ) noexcept; } /** @brief Encode a Unicode code point in the given encoding. * * The encoding is specified by the template parameter `C` which * can be one of the character types (utf::is_character), the * encoding used is picked based on utf::max_units. * * The return value is the number of values written into `sink`. * If none were written, the encoding failed. Note that when the * target needs only one unit per code point, `ch` is written as-is * and the result is always 1. * * If your input is a string and you want to advance it, use the * utf::encode(R, basic_char_range) variant. */ template inline std::size_t encode(R &sink, char32_t ch) { std::size_t ret; if constexpr(max_units == 1) { sink.put(C(ch)); ret = 1; } else { unicode_base_t buf[max_units]; ret = detail::encode(buf, ch); for (std::size_t i = 0; i < ret; ++i) { sink.put(C(buf[i])); } } return ret; } /** @brief Encode a Unicode code point from a string in the given encoding. * * The encoding is specified by the template parameter `C` which * can be one of the character types (utf::is_character), the * encoding used is picked based on utf::max_units. * * Unlike utf::encode(R, char32_t), this takes a string as a second * input and the string can be in any UTF encoding and use any of the * available character types. The function advances the string by one * code point, which may mean multiple values. * * The return value is the number of values written into `sink`. * If none were written, the encoding failed and the string is not * advanced. * * NOTE: in the branch that compares the two utf::max_units values * for equality, a single code unit is copied and popped (see the * FIXME in the body), which may be less than a whole code point. 
*/ template inline std::size_t encode(OR &sink, IR &r) { using IC = std::remove_const_t>; if constexpr(max_units == 1) { std::size_t n = 0; if (!r.empty() && (n = utf::encode(sink, char32_t(r.front())))) { r.pop_front(); } return n; } else if constexpr(max_units == max_units) { /* FIXME: advance by a whole character always */ if (!r.empty()) { sink.put(C(r.front())); r.pop_front(); return 1; } } else { auto rr = r; if (char32_t ch; utf::decode(rr, ch)) { if (std::size_t n; (n = utf::encode(sink, ch))) { r = rr; return n; } } } return 0; } /** @brief Encode a Unicode code point from a string in the given encoding. * * This is the same as utf::encode() but takes the number of bits as in * utf::unit_bits as an input instead of a type, mapping 8 to `char`, * 16 to `char16_t` and 32 to `char32_t`. */ template inline std::size_t encode(OR &sink, IR &r) { return encode>(sink, r); } /** @brief Write a Unicode replacement character into the sink. * * Depending on the type input, this encodes the replacement character * into the sink as either UTF-8, UTF-16 or UTF-32. All 4 character types * are allowed. * * When more than two units may be needed per code point, the three * units `0xEF`, `0xBF`, `0xBD` are written and 3 is returned. Otherwise * a single `0xFFFD` unit is written and 1 is returned. */ template inline std::size_t replace(R &sink) { if constexpr(max_units > 2) { sink.put(range_value_t(0xEF)); sink.put(range_value_t(0xBF)); sink.put(range_value_t(0xBD)); return 3; } else { sink.put(range_value_t(0xFFFD)); } return 1; } /** @brief Write a Unicode replacement character into the sink. * * This is the same as utf::replace() but takes the number of bits as in * utf::unit_bits as an input instead of a type, mapping 8 to `char`, * 16 to `char16_t` and 32 to `char32_t`. */ template inline std::size_t replace(R &sink) { return replace>(sink); } /** @brief Get the number of Unicode code points in a string. * * This function keeps reading Unicode code points while it can and * once it can't it returns the number of valid ones with the rest * of the input string range being in `cont`. 
That means if the entire * string is a valid UTF-8 string, `cont` will be empty, otherwise it * will begin at the first invalid UTF-8 code unit. * * If you're sure the string is valid or you don't need to handle the * error, you can use the more convenient overload below. */ OSTD_EXPORT std::size_t length(string_range r, string_range &cont) noexcept; /** @brief Get the number of Unicode code points in a string. * * This function keeps reading Unicode code points while it can and * once it can't it returns the number of valid ones with the rest * of the input string range being in `cont`. That means if the entire * string is a valid UTF-16 string, `cont` will be empty, otherwise it * will begin at the first invalid UTF-16 code unit. * * If you're sure the string is valid or you don't need to handle the * error, you can use the more convenient overload below. */ OSTD_EXPORT std::size_t length(u16string_range r, u16string_range &cont) noexcept; /** @brief Get the number of Unicode code points in a string. * * This function keeps reading Unicode code points while it can and * once it can't it returns the number of valid ones with the rest * of the input string range being in `cont`. That means if the entire * string is a valid UTF-32 string, `cont` will be empty, otherwise it * will begin at the first invalid code point. * * If you're sure the string is valid or you don't need to handle the * error, you can use the more convenient overload below. */ OSTD_EXPORT std::size_t length(u32string_range r, u32string_range &cont) noexcept; /** @brief Get the number of Unicode code points in a string. * * This function keeps reading Unicode code points while it can and * once it can't it returns the number of valid ones with the rest * of the input string range being in `cont`. That means if the entire * string is a valid Unicode wide string, `cont` will be empty, * otherwise it will begin at the first invalid code unit. 
* * If you're sure the string is valid or you don't need to handle the * error, you can use the more convenient overload below. * * The behavior of this function is platform dependent as wide * characters represent different things on different systems. */ OSTD_EXPORT std::size_t length(wstring_range r, wstring_range &cont) noexcept; /** @brief Get the number of Unicode code points in a UTF-8 string. * * If an invalid UTF-8 sequence is encountered, it's considered * 1 character and therefore the resulting length will be the * number of valid code points plus the number of invalid * code units as if they were replaced with valid code points. * * If you need to stop at an invalid code unit and get the * continuation string, use the overload above. */ OSTD_EXPORT std::size_t length(string_range r) noexcept; /** @brief Get the number of Unicode code points in a UTF-16 string. * * If an invalid UTF-16 sequence is encountered, it's considered * 1 character and therefore the resulting length will be the * number of valid code points plus the number of invalid * code units as if they were replaced with valid code points. * * If you need to stop at an invalid code unit and get the * continuation string, use the overload above. */ OSTD_EXPORT std::size_t length(u16string_range r) noexcept; /** @brief Get the number of Unicode code points in a UTF-32 string. * * This, like the above overloads for multibyte encodings, treats * invalid values as code points, so this function effectively just * returns the size of the given range. */ OSTD_EXPORT std::size_t length(u32string_range r) noexcept; /** @brief Get the number of Unicode code points in a wide string. * * If an invalid sequence is encountered, it's considered * 1 character and therefore the resulting length will be the * number of valid code points plus the number of invalid * code units as if they were replaced with valid code points. 
* * If you need to stop at an invalid code unit and get the * continuation string, use the overload above. * * The behavior of this function is platform dependent as wide * characters represent different things on different systems. */ OSTD_EXPORT std::size_t length(wstring_range r) noexcept; namespace detail {/* Forward range that re-encodes `p_range` piece by piece: advance() encodes into `p_buf` via utf::encode() and `p_left` is the part of that buffer not yet consumed. */ template struct unicode_range: input_range> { using range_category = forward_range_tag; using value_type = OC; using reference = OC; using size_type = std::size_t; unicode_range() = delete; unicode_range(basic_char_range r): p_range(r) { if (!r.empty()) { advance(); } } bool empty() const { return p_left.empty(); } void pop_front() { std::size_t n = p_left.size(); if (n) { bool done = false; if constexpr(max_units > 1) { done = (n > 1); } if (done || p_range.empty()) { p_left.pop_front(); return; } } advance(); } OC front() const { return p_left.front(); } private: void advance() { auto r = basic_char_range(p_buf, p_buf + max_units); if (std::size_t n; !(n = utf::encode(r, p_range))) { /* range is unchanged */ p_left = basic_char_range{}; throw utf_error{"Unicode encoding failed"}; } else { p_left = basic_char_range{p_buf, p_buf + n}; } } basic_char_range p_range; basic_char_range p_left{}; OC p_buf[max_units]; }; } /* namespace detail */ /** @brief Iterate a Unicode string as a different encoding. * * This returns a forward range (its category is ostd::forward_range_tag) * that will iterate over the given Unicode character range as a * different UTF encoding. * The UTF encoding is specified using the given type `C`, being * UTF-8 for `char`, UTF-16 for `char16_t`, UTF-32 for `char32_t` * and platform specific for `wchar_t`. * * @throws utf::utf_error when re-encoding fails, which can happen on * construction (for non-empty input) as well as while popping. */ template inline auto iter_u(R &&str) { return detail::unicode_range< std::remove_const_t< range_value_t>> >, C >(std::forward(str)); } /** @brief Like ostd::iter_u but taking bits to specify the encoding. * * This uses utf::unicode_t with `N` to call utf::iter_u(). 
*/ template inline auto iter_u(R &&str) { return iter_u>(std::forward(str)); } /** @brief Check whether a code point is alphanumeric. * * This is true for either utf::isalpha() or utf::isdigit(). Also * equivalent to std::isalnum(). */ OSTD_EXPORT bool isalnum(char32_t c) noexcept; /** @brief Check whether a code point is alphabetic. * * This is like std::isalpha() but strictly Unicode and works on the * entire code point range. Returns true for alphabetic characters, * false for others. * * The categories considered alphabetic are `L*`. */ OSTD_EXPORT bool isalpha(char32_t c) noexcept; /** @brief Check whether a code point is a blank. * * This is like std::isblank() but strictly Unicode and works on the * entire code point range. Returns true for blanks, false for others. * * The blank characters are only space (`U+20`) and tab (`U+9`). */ OSTD_EXPORT bool isblank(char32_t c) noexcept; /** @brief Check whether a code point is a control character. * * This is like std::iscntrl() but strictly Unicode and works on the * entire code point range. Returns true for control characters, * false for others. * * The category considered control characters is `Cc`. */ OSTD_EXPORT bool iscntrl(char32_t c) noexcept; /** @brief Check whether a code point is a digit. * * This is like std::isdigit() but strictly Unicode and works on the * entire code point range. Returns true for digit characters, * false for others. * * The category considered a digit is `Nd`. */ OSTD_EXPORT bool isdigit(char32_t c) noexcept; /** @brief Check whether a code point is graphic. * * This is true when the input is not utf::isspace() and is * utf::isprint(). Also equivalent to std::isgraph(). */ OSTD_EXPORT bool isgraph(char32_t c) noexcept; /** @brief Check whether a code point is lowercase. * * This is like std::islower() but strictly Unicode and works on the * entire code point range. Returns true for lowercase characters, * false for others. * * The category considered lowercase is `Ll`.
*/ OSTD_EXPORT bool islower(char32_t c) noexcept; /** @brief Check whether a code point is printable. * * Equivalent to std::isprint() but for Unicode. This is true for * all characters that are not utf::iscntrl() and that are not * U+2028, U+2029, U+FFF9, U+FFFA, U+FFFB. */ OSTD_EXPORT bool isprint(char32_t c) noexcept; /** @brief Check whether a code point is punctuation. * * This is like std::ispunct() but strictly Unicode and works on the * entire code point range. Returns true for punctuation characters, * false for others. Punctuation characters are those that satisfy * utf::isgraph() but are not utf::isalnum(). */ OSTD_EXPORT bool ispunct(char32_t c) noexcept; /** @brief Check whether a code point is whitespace. * * This is like std::isspace() but strictly Unicode and works on the * entire code point range. Returns true for whitespace, false for others. * * The categories considered whitespace are `Z*` with the `B`, `S` and `WS` * bidirectional categories. */ OSTD_EXPORT bool isspace(char32_t c) noexcept; /** @brief Check whether a code point is titlecase. * * This has no standard ctype equivalent. Returns true for * titlecase characters, false for others. * * The category considered titlecase is `Lt`. */ OSTD_EXPORT bool istitle(char32_t c) noexcept; /** @brief Check whether a code point is uppercase. * * This is like std::isupper() but strictly Unicode and works on the * entire code point range. Returns true for uppercase characters, * false for others. * * The category considered uppercase is `Lu`. */ OSTD_EXPORT bool isupper(char32_t c) noexcept; /** @brief Check whether a code point is a valid character. * * This is all code points within the range (utf::max_unicode) * that are not surrogate code points (U+D800 to U+DFFF), * non-characters (U+FDD0 to U+FDEF) and end-of-plane * characters (U+FFFE and U+FFFF). * * This is Unicode specific and has no standard ctype equivalent.
*/ OSTD_EXPORT bool isvalid(char32_t c) noexcept; /** @brief Check whether a code point is a hexadecimal digit. * * This only considers the ASCII character range, returning * true for digits (U+30 to U+39) as well as letters A to F * in uppercase and lowercase (U+41 to U+46, U+61 to U+66). * * Behaves exactly the same as std::isxdigit() in the C locale, * but unlike std::isxdigit() it never changes behavior, i.e. it cannot * support codepage extensions, being Unicode only. */ OSTD_EXPORT bool isxdigit(char32_t c) noexcept; /** @brief Convert a Unicode code point to lowercase. * * Like std::tolower() but works with Unicode code points. If the * code point is already lowercase or has no lowercase equivalent, * this just returns the input unchanged, otherwise it returns the * matching lowercase variant. */ OSTD_EXPORT char32_t tolower(char32_t c) noexcept; /** @brief Convert a Unicode code point to uppercase. * * Like std::toupper() but works with Unicode code points. If the * code point is already uppercase or has no uppercase equivalent, * this just returns the input unchanged, otherwise it returns the * matching uppercase variant. */ OSTD_EXPORT char32_t toupper(char32_t c) noexcept; /** @brief Compare two UTF-8 strings. * * Returns `s1.compare(s2)`, so for detailed documentation * please refer to basic_char_range::compare(). */ inline int compare(string_range s1, string_range s2) noexcept { return s1.compare(s2); } /** @brief Compare two UTF-16 strings. * * Returns `s1.compare(s2)`, so for detailed documentation * please refer to basic_char_range::compare(). */ inline int compare(u16string_range s1, u16string_range s2) noexcept { return s1.compare(s2); } /** @brief Compare two UTF-32 strings. * * Returns `s1.compare(s2)`, so for detailed documentation * please refer to basic_char_range::compare(). */ inline int compare(u32string_range s1, u32string_range s2) noexcept { return s1.compare(s2); } /** @brief Compare two wide strings.
* * Returns `s1.compare(s2)`, so for detailed documentation * please refer to basic_char_range::compare(). */ inline int compare(wstring_range s1, wstring_range s2) noexcept { return s1.compare(s2); } /** @brief Compare two UTF-8 strings case-insensitively. * * The case insensitive comparison is done by advancing by code points * and converting each code point to lowercase using utf::tolower() * before doing the comparison, with invalid code units being * compared as they are (so this function never fails). * * @see basic_char_range::case_compare() */ OSTD_EXPORT int case_compare(string_range s1, string_range s2) noexcept; /** @brief Compare two UTF-16 strings case-insensitively. * * The case insensitive comparison is done by advancing by code points * and converting each code point to lowercase using utf::tolower() * before doing the comparison, with invalid code units being * compared as they are (so this function never fails). * * @see basic_char_range::case_compare() */ OSTD_EXPORT int case_compare(u16string_range s1, u16string_range s2) noexcept; /** @brief Compare two UTF-32 strings case-insensitively. * * The case insensitive comparison is done by converting each code * point to lowercase using utf::tolower() before doing the comparison, * with invalid code points being compared as they are (so this function * never fails). * * @see basic_char_range::case_compare() */ OSTD_EXPORT int case_compare(u32string_range s1, u32string_range s2) noexcept; /** @brief Compare two wide strings case-insensitively. * * The case insensitive comparison is done by advancing by code points * and converting each code point to lowercase using utf::tolower() * before doing the comparison, with invalid code units being * compared as they are (so this function never fails). * * The internal behavior of this function is platform specific * depending on the size of `wchar_t`.
* * @see basic_char_range::case_compare() */ OSTD_EXPORT int case_compare(wstring_range s1, wstring_range s2) noexcept; /** @} */ } /* namespace utf */ /* Out-of-line definitions of the basic_char_range members below; each one forwards to the utf:: free function of the same name. */ template inline std::size_t basic_char_range::length() const noexcept { return utf::length(*this); } template inline std::size_t basic_char_range::length( basic_char_range &cont ) const noexcept { return utf::length(*this, cont); } template template inline auto basic_char_range::iter_u() const { return utf::iter_u(*this); } template template inline auto basic_char_range::iter_u() const { return utf::iter_u(*this); } template inline int basic_char_range::case_compare( basic_char_range s ) const noexcept { return utf::case_compare(*this, s); } /* string literals */ inline namespace literals { inline namespace string_literals { /** @addtogroup Strings * @{ */ /** @brief A custom literal for string ranges. * * You need to enable this explicitly by using this namespace. It's * not enabled by default to ensure compatibility with existing code. * * The operator is spelled without whitespace between the quotes and * the suffix, as the spaced form is deprecated in newer C++ standards. * * @param str Pointer to the first character of the literal. * @param len Number of characters in the literal. * @return A string_range constructed from `str` and `str + len`. */ inline string_range operator""_sr(char const *str, std::size_t len) noexcept { return string_range(str, str + len); } /** @} */ } } /** @} */ } /* namespace ostd */ namespace std { /** @addtogroup Strings * @{ */ /** @brief Standard std::hash integration for string slices. * * This integrates all possible slice types with standard hashing. * It uses the hashing of the matching std::basic_string_view, * so the algorithm (and thus result) will always match standard strings. */ template struct hash> { std::size_t operator()(ostd::basic_char_range const &v) const noexcept { return hash>>{}(v); } }; /** @} */ } #endif /** @} */