diff --git a/ostd/string.hh b/ostd/string.hh
index 5d03e8f..5bfac92 100644
--- a/ostd/string.hh
+++ b/ostd/string.hh
@@ -296,11 +296,20 @@ public:
      * method of std::char_traits, but does not depend on the strings
      * to be terminated.
      *
-     * If this slice is empty and the other is not, this method returns
-     * -1. If it's the other way around, it returns 1. If both are empty,
-     * 0 is returned. Otherwise, the `compare` method of std::char_traits
-     * is used to compare the data, using the smaller of the lengths as the
-     * count.
+     * It performs an ordinary lexicographical comparison: the units are
+     * compared in order and the first slice to hold a lesser unit is
+     * considered lexicographically less. If the slices are equal up to
+     * the length of the shorter one, the shorter one is the lesser one.
+     *
+     * If the `this` slice is the lesser one, a negative value is
+     * returned. If they are equal (two zero length slices count as
+     * equal) then `0` is returned. Otherwise, a positive value is
+     * returned.
+     *
+     * This works with the slice's native unit values, i.e. bytes
+     * for UTF-8, `char16_t` for UTF-16 and `char32_t` for UTF-32.
+     * These units are compared by getting the difference between
+     * them (i.e. `this[index] - other[index]`).
      *
      * It is not a part of the range interface, just the string slice
      * interface.
@@ -308,32 +317,9 @@ public:
      * @see case_compare()
      */
     int compare(basic_char_range s) const noexcept {
-        size_type s1 = size(), s2 = s.size();
-        int ret;
-        if (!s1 || !s2) {
-            goto diffsize;
-        }
-        if ((ret = TR::compare(data(), s.data(), std::min(s1, s2)))) {
-            return ret;
-        }
-diffsize:
-        return (s1 < s2) ? -1 : ((s1 > s2) ? 1 : 0);
-    }
-
-    /** @brief Compares two slices in a case insensitive manner.
-     *
-     * Lexicographically compares the strings like compare(), but in
-     * a case insensitive way. The std::toupper() function is used to
-     * convert the characters to uppercase when comparing.
-     *
-     * Returns a negative value when this slice is less than the other
-     * slice and a positive value when the other way around. Zero is
-     * returned when they're equal.
-     */
-    int case_compare(basic_char_range s) const noexcept {
         size_type s1 = size(), s2 = s.size();
         for (size_type i = 0, ms = std::min(s1, s2); i < ms; ++i) {
-            int d = std::toupper(p_beg[i]) - std::toupper(s[i]);
+            int d = p_beg[i] - s[i];
             if (d) {
                 return d;
             }
@@ -341,6 +327,20 @@ diffsize:
         return (s1 < s2) ? -1 : ((s1 > s2) ? 1 : 0);
     }
 
+    /** @brief Compares two slices in a case insensitive manner.
+     *
+     * Works exactly the same as compare(), but in a case insensitive
+     * way, i.e. it lowercases the characters and compares them after
+     * that.
+     *
+     * For UTF-8, it decodes the string on the fly, then lowercases the
+     * decoded code points and uses their difference (without encoding
+     * them back). If the decoding fails, the failing code unit is used
+     * as-is, so this function never fails. Identical treatment is given
+     * to UTF-16.
+     */
+    inline int case_compare(basic_char_range s) const noexcept;
+
     /** @brief Iterate over the code points of the string.
      *
      * Like utf::iter_codes().
@@ -905,6 +905,19 @@ namespace utf {
     char32_t tolower(char32_t c);
     char32_t toupper(char32_t c);
 
+    /** @brief Free function form of compare(); calls `s1.compare(s2)`. */
+    inline int compare(string_range s1, string_range s2) noexcept {
+        return s1.compare(s2);
+    }
+    /** @brief Free function form of compare(); calls `s1.compare(s2)`. */
+    inline int compare(u32string_range s1, u32string_range s2) noexcept {
+        return s1.compare(s2);
+    }
+
+    /** @brief Case insensitive comparison; negative, zero or positive. */
+    int case_compare(string_range s1, string_range s2) noexcept;
+    /** @brief Case insensitive comparison; negative, zero or positive. */
+    int case_compare(u32string_range s1, u32string_range s2) noexcept;
     /** @} */
 
 } /* namespace utf */
@@ -926,6 +939,14 @@ inline auto basic_char_range::iter_codes() const {
     return utf::iter_codes(*this);
 }
 
+/* out of class: utf::case_compare() is only declared after the class */
+template<typename T>
+inline int basic_char_range<T>::case_compare(
+    basic_char_range s
+) const noexcept {
+    return utf::case_compare(*this, s);
+}
+
 /* string literals */
 
 inline namespace literals {
diff --git a/src/string.cc b/src/string.cc
index d914f19..dbcd5a7 100644
--- a/src/string.cc
+++ b/src/string.cc
@@ -340,5 +340,57 @@ char32_t toupper(char32_t c) {
 
 #endif /* __has_include("string_utf.hh") */
 
+/* Case insensitive comparison of two `string_range` slices.
+ *
+ * Both slices are walked one code point at a time. A unit of at most 0x7F
+ * is taken as-is; anything else goes through utf::decode(), and when that
+ * fails the unit is popped and its raw value stands in for the code point
+ * (decode() is relied upon to consume the units it decodes). The values
+ * are lowercased with utf::tolower(); the first nonzero difference wins.
+ *
+ * The loop ends as soon as either slice is exhausted, so front() is never
+ * called on an empty slice. If no difference was found by then, a slice
+ * with input left over is the greater one; if both ran out, they are equal.
+ *
+ * The slices are not cut to a common unit count up front, as that could
+ * split a multi-unit sequence and hand a truncated one to the decoder.
+ */
+int case_compare(string_range s1, string_range s2) noexcept {
+    while (!s1.empty() && !s2.empty()) {
+        char32_t ldec = s1.front(), rdec = s2.front();
+        if ((ldec <= 0x7F) || !utf::decode(s1, ldec)) {
+            s1.pop_front();
+        }
+        if ((rdec <= 0x7F) || !utf::decode(s2, rdec)) {
+            s2.pop_front();
+        }
+        int d = int(utf::tolower(ldec)) - int(utf::tolower(rdec));
+        if (d) {
+            return d;
+        }
+    }
+    if (s1.empty()) {
+        return s2.empty() ? 0 : -1;
+    }
+    return 1;
+}
+
+/* Case insensitive comparison of two `u32string_range` slices.
+ *
+ * The units are lowercased with utf::tolower() and compared pairwise over
+ * the common length; the first nonzero difference is returned. If there is
+ * none, the shorter slice is the lesser one and equal lengths yield 0.
+ */
+int case_compare(u32string_range s1, u32string_range s2) noexcept {
+    std::size_t s1l = s1.size(), s2l = s2.size();
+    for (std::size_t i = 0, ms = std::min(s1l, s2l); i < ms; ++i) {
+        int d = int(utf::tolower(s1[i])) - int(utf::tolower(s2[i]));
+        if (d) {
+            return d;
+        }
+    }
+    return (s1l < s2l) ? -1 : ((s1l > s2l) ? 1 : 0);
+}
+
 } /* namespace utf */
 } /* namespace ostd */