add iter_codes to iterate a UTF-8 string by code points

master
Daniel Kolesa 2017-12-31 14:50:48 +01:00
parent d6a13d8f97
commit b2ee5c1bd0
2 changed files with 77 additions and 1 deletions

View File

@ -50,6 +50,7 @@
#ifndef OSTD_STRING_HH
#define OSTD_STRING_HH
#include <cstdint>
#include <cstddef>
#include <cctype>
#include <string>
@ -58,6 +59,7 @@
#include <functional>
#include <utility>
#include <vector>
#include <stdexcept>
#include <ostd/range.hh>
#include <ostd/algorithm.hh>
@ -711,6 +713,80 @@ struct ranged_traits<std::basic_string<T, TR, A> const> {
}
};
/* more UTF utilities beyond basic API */
namespace utf {
/** @addtogroup Strings
* @{
*/
/** @brief Thrown on UTF-8 decoding failure.
 *
 * Derives from std::runtime_error and inherits all of its constructors,
 * so it is constructed from a message and can be caught either as
 * utf_error or as any of its standard base classes.
 */
struct utf_error: public std::runtime_error {
    /* reuse the base constructors (message as std::string or char const *) */
    using runtime_error::runtime_error;
};
namespace detail {
/* A range yielding the code points decoded from a UTF-8 string_range.
 *
 * One code point is decoded ahead of time and cached in p_current, while
 * p_range holds the input that has not been decoded yet. A negative
 * p_current means there is no current element (the range is empty, or
 * the last decoding attempt failed).
 */
struct codepoint_range: input_range<codepoint_range> {
    using range_category = forward_range_tag;
    using value_type = char32_t;
    using reference = char32_t;
    using size_type = std::size_t;

    codepoint_range() = delete;

    /* Starts iteration over r; decodes the first code point right away
     * and therefore throws utf_error if r does not begin with a valid
     * sequence. An empty r yields an empty range.
     */
    codepoint_range(string_range r): p_range(r) {
        if (r.empty()) {
            p_current = -1;
        } else {
            advance();
        }
    }

    /* True once there is no current code point. */
    bool empty() const { return (p_current < 0); }

    /* Moves to the next code point, throwing utf_error when the remaining
     * input cannot be decoded.
     *
     * End of input is decided solely by whether any undecoded input is
     * left. The value of the current code point must not take part in
     * that decision: U+0000 is a valid code point and is stored as 0, so
     * requiring `p_current > 0` made a string ending in a NUL character
     * fall through to advance() on an empty range and raise a bogus
     * decoding error. Popping an already-empty range simply keeps it empty.
     */
    void pop_front() {
        if (p_range.empty()) {
            p_current = -1;
            return;
        }
        advance();
    }

    /* The current code point; only meaningful while !empty(). */
    char32_t front() const {
        return p_current;
    }

private:
    /* Decodes the next code point from p_range into p_current via
     * codepoint(); on failure marks the range as empty and throws.
     */
    void advance() {
        if (char32_t ret; !codepoint(p_range, ret)) {
            /* range is unchanged */
            p_current = -1;
            throw utf_error{"UTF-8 decoding failed"};
        } else {
            p_current = ret;
        }
    }

    string_range p_range;   /* input not decoded yet */
    std::int32_t p_current; /* cached code point, negative when none */
};
} /* namespace detail */
/** @brief Iterate over the code points of a string.
 *
 * Wraps the given string in a range whose elements are the `char32_t`
 * code points decoded from it. The resulting range is tagged with
 * ostd::forward_range_tag.
 *
 * Decoding happens whenever the range advances, which means both in this
 * call (the first code point is decoded when the range is constructed)
 * and in every later `pop_front()`.
 *
 * @param r The string to decode.
 *
 * @returns A range over the code points of `r`.
 *
 * @throws utf_error (declared in this namespace) when the string cannot
 *         be decoded at the point of advancement.
 */
inline auto iter_codes(string_range r) {
    detail::codepoint_range codes{r};
    return codes;
}
/** @} */
} /* namespace utf */
/* string literals */
inline namespace literals {

View File

@ -45,7 +45,7 @@ static inline bool codepoint_dec(string_range &r, char32_t &cret) {
ret = (ret << 6) | bch;
}
/* number of continuation bytes */
std::size_t n = sr.data() - r.data();
std::size_t n = sr.data() - r.data() - 1;
/* invalid sequence - too many continuation bits */
if (n > 3) {
return false;