unicode-aware case-insensitive string compares

master
Daniel Kolesa 2018-01-03 01:22:07 +01:00
parent 2949b2de0c
commit ad149ff0f6
2 changed files with 76 additions and 29 deletions

View File

@ -296,11 +296,20 @@ public:
* method of std::char_traits, but does not depend on the strings
* to be terminated.
*
* If this slice is empty and the other is not, this method returns
* -1. If it's the other way around, it returns 1. If both are empty,
* 0 is returned. Otherwise, the `compare` method of std::char_traits
* is used to compare the data, using the smaller of the lengths as the
* count.
* It performs an ordinary lexicographical comparison, the values
* are compared and the first string to have a lesser value is
* considered lexicographically less. If they are equal up to a
* point but one of them terminates early, it's also less.
*
* If the `this` slice is the lesser one, a negative value is
* returned. If they are equal (if they're both zero length,
* it counts as equal) then `0` is returned. Otherwise, a
* positive value is returned.
*
* This works with the slice's native unit values, i.e. bytes
* for UTF-8, `char16_t` for UTF-16 and `char32_t` for UTF-32.
* These units are compared by getting the difference between
* them (i.e. `this[index] - other[index]`).
*
* It is not a part of the range interface, just the string slice
* interface.
@ -308,32 +317,9 @@ public:
* @see case_compare()
*/
int compare(basic_char_range<value_type const> s) const noexcept {
size_type s1 = size(), s2 = s.size();
int ret;
if (!s1 || !s2) {
goto diffsize;
}
if ((ret = TR::compare(data(), s.data(), std::min(s1, s2)))) {
return ret;
}
diffsize:
return (s1 < s2) ? -1 : ((s1 > s2) ? 1 : 0);
}
/** @brief Compares two slices in a case insensitive manner.
*
* Lexicographically compares the strings like compare(), but in
* a case insensitive way. The std::toupper() function is used to
* convert the characters to uppercase when comparing.
*
* Returns a negative value when this slice is less than the other
* slice and a positive value when the other way around. Zero is
* returned when they're equal.
*/
int case_compare(basic_char_range<value_type const> s) const noexcept {
size_type s1 = size(), s2 = s.size();
for (size_type i = 0, ms = std::min(s1, s2); i < ms; ++i) {
int d = std::toupper(p_beg[i]) - std::toupper(s[i]);
int d = p_beg[i] - s[i];
if (d) {
return d;
}
@ -341,6 +327,20 @@ diffsize:
return (s1 < s2) ? -1 : ((s1 > s2) ? 1 : 0);
}
/** @brief Compares two slices in a case insensitive manner.
*
* Works exactly the same as compare(), but in a case insensitive
* way, i.e. it lowercases the characters and compares them after
* that.
*
* For UTF-8, it decodes the string on the fly, then lowercases the
* decoded code points and uses their difference (without encoding
* them back). If the decoding fails, the failing code unit is used
* as-is, so this function never fails. Identical treatment is given
* to UTF-16.
*/
inline int case_compare(basic_char_range<value_type const> s) const noexcept;
/** @brief Iterate over the code points of the string.
*
* Like utf::iter_codes().
@ -905,6 +905,15 @@ namespace utf {
char32_t tolower(char32_t c);
char32_t toupper(char32_t c);
inline int compare(string_range s1, string_range s2) noexcept {
return s1.compare(s2);
}
inline int compare(u32string_range s1, u32string_range s2) noexcept {
return s1.compare(s2);
}
int case_compare(string_range s1, string_range s2) noexcept;
int case_compare(u32string_range s1, u32string_range s2) noexcept;
/** @} */
} /* namespace utf */
@ -926,6 +935,13 @@ inline auto basic_char_range<T>::iter_codes() const {
return utf::iter_codes(*this);
}
template<typename T>
inline int basic_char_range<T>::case_compare(
basic_char_range<T const> s
) const noexcept {
return utf::case_compare(*this, s);
}
/* string literals */
inline namespace literals {

View File

@ -340,5 +340,36 @@ char32_t toupper(char32_t c) {
#endif /* __has_include("string_utf.hh") */
int case_compare(string_range s1, string_range s2) noexcept {
std::size_t s1l = s1.size(), s2l = s2.size(), ms = std::min(s1l, s2l);
s1 = s1.slice(0, ms);
s2 = s2.slice(0, ms);
for (;;) {
char32_t ldec = s1.front(), rdec = s2.front();
if ((ldec <= 0x7F) || !utf::decode(s1, ldec)) {
s1.pop_front();
}
if ((rdec <= 0x7F) || !utf::decode(s2, rdec)) {
s2.pop_front();
}
int d = int(utf::tolower(ldec)) - int(utf::tolower(rdec));
if (d) {
return d;
}
}
return (s1l < s2l) ? -1 : ((s1 > s2) ? 1 : 0);
}
int case_compare(u32string_range s1, u32string_range s2) noexcept {
std::size_t s1l = s1.size(), s2l = s2.size();
for (std::size_t i = 0, ms = std::min(s1l, s2l); i < ms; ++i) {
int d = int(utf::tolower(s1[i])) - int(utf::tolower(s2[i]));
if (d) {
return d;
}
}
return (s1l < s2l) ? -1 : ((s1l > s2l) ? 1 : 0);
}
} /* namespace utf */
} /* namespace ostd */