libostd/src/string.cc

/* String implementation details, mainly regarding Unicode support.
 *
 * This file is part of libostd. See COPYING.md for further information.
 */

#include <cstdint>
#include <cstdlib>
#include "ostd/string.hh"

namespace ostd {
namespace utf {

/* Upper bound (inclusive) used by the code in this file when checking
 * whether a value is an acceptable code point.
 */
constexpr std::uint32_t MaxCodepoint = 0x10FFFF;

namespace detail {
    /* Decodes one UTF-8 encoded code point from the front of `r`.
     *
     * On success the code point is stored in `cret`, `r` is advanced past
     * the bytes that were consumed and `true` is returned.
     *
     * On failure `false` is returned and neither `r` nor `cret` is touched.
     * Failure means one of: empty input, a truncated sequence, a missing or
     * stray continuation byte, an overlong encoding, a value greater than
     * MaxCodepoint, or a surrogate code point (U+D800 to U+DFFF).
     */
    static inline bool u8_decode(string_range &r, char32_t &cret) noexcept {
        /* indexed by the number of continuation bytes; a decoded value that
         * is not strictly greater than the entry is rejected, which catches
         * overlong encodings (1 to 3) and a stray continuation byte used as
         * a lead byte (0)
         */
        static std::uint32_t const ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };
        if (r.empty()) {
            return false;
        }
        std::uint32_t ch = static_cast<unsigned char const>(r.front());
        if (ch <= 0x7F) {
            /* ASCII */
            cret = ch;
            r.pop_front();
            return true;
        }
        std::uint32_t ret = 0;
        /* work on a copy so that `r` stays intact if decoding fails */
        string_range sr = r;
        sr.pop_front();
        /* continuation bytes; every leading 1 bit after the first one in
         * the lead byte announces one continuation byte
         */
        for (; ch & 0x40; ch <<= 1) {
            /* need a continuation byte but nothing left in the string */
            if (sr.empty()) {
                return false;
            }
            /* the continuation byte */
            std::uint32_t nch = static_cast<unsigned char const>(sr.front());
            sr.pop_front();
            /* lower 6 bits */
            std::uint32_t bch = nch & 0x3F;
            /* not a continuation byte (top two bits must be 10) */
            if ((nch ^ bch) != 0x80) {
                return false;
            }
            /* the 6 bits go in the result */
            ret = (ret << 6) | bch;
        }
        /* number of continuation bytes */
        std::size_t n = sr.data() - r.data() - 1;
        /* invalid sequence - too many continuation bits; this must be
         * checked before `n` is used as an index or a shift count
         */
        if (n > 3) {
            return false;
        }
        /* add the up to 7 bits from the first byte, already shifted left by
         * n; another n * 5 puts them n * 6 bits up, above the payload of the
         * continuation bytes
         */
        ret |= (ch & 0x7F) << (n * 5);
        /* invalid sequence - out of bounds */
        if ((ret > MaxCodepoint) || (ret <= ulim[n])) {
            return false;
        }
        /* invalid sequence - surrogate code point
         *
         * this has to be an exact range test; masking with 0xD800 would
         * also reject valid code points such as U+FFFD or U+1D800
         */
        if ((ret >= 0xD800) && (ret <= 0xDFFF)) {
            return false;
        }
        cret = ret;
        r = sr;
        return true;
    }

    /* Encodes the code point `ch` as UTF-8 into `ret`.
     *
     * Returns the number of bytes written (1 to 4). Returns 0 and leaves
     * `ret` untouched when `ch` cannot be encoded, i.e. when it is a
     * surrogate code point (U+D800 to U+DFFF) or greater than MaxCodepoint.
     */
    std::uint8_t u8_encode(
        std::uint8_t (&ret)[4], std::uint32_t ch
    ) noexcept {
        if (ch <= 0x7F) {
            ret[0] = static_cast<std::uint8_t>(ch);
            return 1;
        }
        if (ch <= 0x7FF) {
            ret[0] = static_cast<std::uint8_t>(0xC0 | (ch >> 6));
            ret[1] = static_cast<std::uint8_t>(0x80 | (ch & 0x3F));
            return 2;
        }
        if (ch <= 0xFFFF) {
            /* TODO: optional WTF-8 semantics
             * for now simply reject surrogate code points
             *
             * exact range test; masking with 0xD800 would also reject
             * U+F800 to U+FFFF, including U+FFFD
             */
            if ((ch >= 0xD800) && (ch <= 0xDFFF)) {
                return 0;
            }
            ret[0] = static_cast<std::uint8_t>(0xE0 |  (ch >> 12));
            ret[1] = static_cast<std::uint8_t>(0x80 | ((ch >> 6) & 0x3F));
            ret[2] = static_cast<std::uint8_t>(0x80 |  (ch       & 0x3F));
            return 3;
        }
        if (ch <= MaxCodepoint) {
            /* each continuation byte carries 6 payload bits, so the shifted
             * value has to be masked (not or-ed) with 0x3F
             */
            ret[0] = static_cast<std::uint8_t>(0xF0 |  (ch >> 18));
            ret[1] = static_cast<std::uint8_t>(0x80 | ((ch >> 12) & 0x3F));
            ret[2] = static_cast<std::uint8_t>(0x80 | ((ch >>  6) & 0x3F));
            ret[3] = static_cast<std::uint8_t>(0x80 |  (ch        & 0x3F));
            return 4;
        }
        return 0;
    }
} /* namespace detail */

/* Public entry point for decoding a single code point from the front of
 * `r`; simply forwards to detail::u8_decode and hands back its result.
 */
bool decode(string_range &r, char32_t &ret) noexcept {
    bool const decoded = detail::u8_decode(r, ret);
    return decoded;
}

/* Counts how many code points can be decoded from the front of `r`.
 *
 * Decoding stops at the first position where detail::u8_decode fails
 * (which includes reaching the end of the input). Whatever is left of the
 * range at that point is stored in `cont`.
 */
std::size_t length(string_range r, string_range &cont) noexcept {
    std::size_t count = 0;
    /* the decoded value itself is not needed, only the fact it decoded */
    char32_t cp = U'\0';
    while (detail::u8_decode(r, cp)) {
        ++count;
    }
    cont = r;
    return count;
}

/* unicode-aware ctype
 * the other ones use custom tables for lookups
 */

/* True when `c` is classified as alphabetic or as a digit. */
bool isalnum(char32_t c) {
    if (utf::isalpha(c)) {
        return true;
    }
    return utf::isdigit(c);
}

/* True only for the space and the horizontal tab characters. */
bool isblank(char32_t c) {
    switch (c) {
        case U' ':
        case U'\t':
            return true;
        default:
            return false;
    }
}

/* True when `c` is printable and is not classified as whitespace. */
bool isgraph(char32_t c) {
    if (utf::isspace(c)) {
        return false;
    }
    return utf::isprint(c);
}

/* True when `c` is considered printable.
 *
 * U+2028, U+2029 and U+FFF9 through U+FFFB are never printable; any other
 * value is printable exactly when it is not a control character.
 */
bool isprint(char32_t c) {
    auto const cp = std::uint32_t(c);
    bool const separator = (cp == 0x2028) || (cp == 0x2029);
    bool const annotation = (cp >= 0xFFF9) && (cp <= 0xFFFB);
    if (separator || annotation) {
        return false;
    }
    return !utf::iscntrl(c);
}

/* True when `c` is graphical but neither alphabetic nor a digit. */
bool ispunct(char32_t c) {
    if (!utf::isgraph(c)) {
        return false;
    }
    return !utf::isalnum(c);
}

/* True when `c` is an acceptable code point: not a surrogate, not in
 * the U+FDD0 to U+FDEF non-character block, not one of the last two values
 * of a plane (xxFFFE, xxFFFF) and not greater than MaxCodepoint.
 */
bool isvalid(char32_t c) {
    bool const surrogate  = (c >= 0xD800) && (c <= 0xDFFF);
    bool const non_char   = (c >= 0xFDD0) && (c <= 0xFDEF);
    bool const plane_tail = ((c & 0xFFFE) == 0xFFFE);
    if (surrogate || non_char || plane_tail) {
        return false;
    }
    /* must be within range */
    return (c <= MaxCodepoint);
}

/* True for the ASCII hexadecimal digits: 0-9, a-f and A-F. */
bool isxdigit(char32_t c) {
    auto const cp = std::uint32_t(c);
    /* setting bit 5 maps the ASCII upper case letters onto lower case */
    auto const folded = cp | 32;
    bool const decimal = (cp >= '0') && (cp <= '9');
    bool const letter = (folded >= 'a') && (folded <= 'f');
    return (decimal || letter);
}

/* bsearch comparator for tables of single code points.
 *
 * `a` points to the key and `b` to a table element, both char32_t.
 * Returns a negative value, zero or a positive value when the key is
 * less than, equal to or greater than the element.
 *
 * The values are compared directly and not subtracted: the difference
 * of two char32_t values is unsigned and, once converted to int, has the
 * wrong sign whenever the operands are 2^31 or more apart.
 */
inline int codepoint_cmp1(void const *a, void const *b) {
    char32_t c1 = *static_cast<char32_t const *>(a);
    char32_t c2 = *static_cast<char32_t const *>(b);
    return ((c1 > c2) - (c1 < c2));
}

/* bsearch comparator for tables of code point ranges.
 *
 * `a` points to the key (a char32_t) and `b` to a table element whose
 * first two char32_t values are the inclusive lower and upper bound of a
 * range. Returns zero when the key lies inside the range, a negative value
 * when it is below the range and a positive value when it is above it.
 *
 * The result is derived from comparisons, not from a subtraction, as
 * an unsigned difference converted to int has the wrong sign when the key
 * and the lower bound are 2^31 or more apart.
 */
inline int codepoint_cmp2(void const *a, void const *b) {
    char32_t        c = *static_cast<char32_t const *>(a);
    char32_t const *p =  static_cast<char32_t const *>(b);
    if (c < p[0]) {
        return -1;
    }
    if (c > p[1]) {
        return 1;
    }
    return 0;
}

/* Shared lookup logic for the table driven character classification and
 * case conversion functions.
 *
 * Up to four tables are consulted in a fixed order: ranges, laces1, laces2
 * and singles. A table whose `N` template parameter is zero is skipped at
 * compile time, so its pointer argument may then be anything.
 *
 * NOTE(review): do_is and do_to derive the bsearch element count and
 * element size from the template parameters in two different ways
 * (N / S and S / sizeof(char32_t) versus N >> 4 and N & 0xF). Which
 * encoding the generated tables actually use is not visible here - confirm
 * against the generator before changing either of them.
 */
template<
    std::size_t RangesN, std::size_t RangesS,
    std::size_t Laces1N, std::size_t Laces1S,
    std::size_t Laces2N, std::size_t Laces2S,
    std::size_t SinglesN, std::size_t SinglesS
>
struct uctype_func {
    using compare_fn = int (*)(void const *, void const *);

    /* binary search for `c` in `table`; yields the matching element or a
     * null pointer when there is none
     */
    static char32_t *lookup(
        char32_t c, void const *table,
        std::size_t count, std::size_t size, compare_fn cmp
    ) {
        return static_cast<char32_t *>(
            std::bsearch(&c, table, count, size, cmp)
        );
    }

    /* Membership test.
     *
     * A hit in `ranges` or `singles` means true. A hit in one of the lace
     * tables decides the result right away: true when `c` sits at an even
     * offset from the start of the matched range, false otherwise.
     */
    static bool do_is(
        char32_t c,
        void const *ranges  [[maybe_unused]],
        void const *laces1  [[maybe_unused]],
        void const *laces2  [[maybe_unused]],
        void const *singles [[maybe_unused]]
    ) {
        if constexpr(RangesN != 0) {
            if (lookup(
                c, ranges, RangesN / RangesS, RangesS / sizeof(char32_t),
                codepoint_cmp2
            )) {
                return true;
            }
        }
        if constexpr(Laces1N != 0) {
            char32_t const *hit = lookup(
                c, laces1, Laces1N / Laces1S, Laces1S / sizeof(char32_t),
                codepoint_cmp2
            );
            if (hit) {
                return (((c - hit[0]) % 2) == 0);
            }
        }
        if constexpr(Laces2N != 0) {
            char32_t const *hit = lookup(
                c, laces2, Laces2N / Laces2S, Laces2S / sizeof(char32_t),
                codepoint_cmp2
            );
            if (hit) {
                return (((c - hit[0]) % 2) == 0);
            }
        }
        if constexpr(SinglesN != 0) {
            if (lookup(
                c, singles, SinglesN / SinglesS, SinglesS / sizeof(char32_t),
                codepoint_cmp1
            )) {
                return true;
            }
        }
        return false;
    }

    /* Conversion.
     *
     * ranges:  the third value of the matched element plus the offset of
     *          `c` from the start of the range
     * laces1:  `c + 1` at an even offset from the range start, else `c`
     * laces2:  `c - 1` at an even offset from the range start, else `c`
     * singles: the second value of the matched element
     *
     * When no table matches, `c` is returned unchanged.
     */
    static char32_t do_to(
        char32_t c,
        void const *ranges  [[maybe_unused]],
        void const *laces1  [[maybe_unused]],
        void const *laces2  [[maybe_unused]],
        void const *singles [[maybe_unused]]
    ) {
        if constexpr(RangesN != 0) {
            char32_t const *hit = lookup(
                c, ranges, RangesN >> 4, RangesN & 0xF, codepoint_cmp2
            );
            if (hit) {
                return (hit[2] + (c - hit[0]));
            }
        }
        if constexpr(Laces1N != 0) {
            char32_t const *hit = lookup(
                c, laces1, Laces1N >> 4, Laces1N & 0xF, codepoint_cmp2
            );
            if (hit) {
                bool const odd = (((c - hit[0]) % 2) != 0);
                return (odd ? c : (c + 1));
            }
        }
        if constexpr(Laces2N != 0) {
            char32_t const *hit = lookup(
                c, laces2, Laces2N >> 4, Laces2N & 0xF, codepoint_cmp2
            );
            if (hit) {
                bool const odd = (((c - hit[0]) % 2) != 0);
                return (odd ? c : (c - 1));
            }
        }
        if constexpr(SinglesN != 0) {
            char32_t const *hit = lookup(
                c, singles, SinglesN >> 4, SinglesN & 0xF, codepoint_cmp1
            );
            if (hit) {
                return hit[1];
            }
        }
        return c;
    }
};

/* these are generated; only declared here, the definitions are presumably
 * supplied by the string_utf.hh include that follows
 */
bool isalpha(char32_t c);
bool iscntrl(char32_t c);
bool isdigit(char32_t c);
bool islower(char32_t c);
bool isspace(char32_t c);
bool istitle(char32_t c);
bool isupper(char32_t c);
char32_t tolower(char32_t c);
char32_t toupper(char32_t c);

#include "string_utf.hh"

} /* namespace utf */
} /* namespace ostd */
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`/* String implementation details, mainly regarding Unicode support.`
			`*`
			`* This file is part of libostd. See COPYING.md for futher information.`
			`*/`

			`#include <cstdint>`
implement all the unicode ctype funcs, generate the tables 2018-01-02 22:23:03 +01:00			`#include <cstdlib>`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`#include "ostd/string.hh"`

			`namespace ostd {`
			`namespace utf {`

			`constexpr std::uint32_t MaxCodepoint = 0x10FFFF;`

add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`namespace detail {`
			`static inline bool u8_decode(string_range &r, char32_t &cret) noexcept {`
			`static std::uint32_t const ulim[] = { 0xFF, 0x7F, 0x7FF, 0xFFFF };`
			`if (r.empty()) {`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`return false;`
			`}`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`std::uint32_t ch = static_cast<unsigned char const>(r.front());`
			`if (ch <= 0x7F) {`
			`/* ASCII */`
			`cret = ch;`
			`r.pop_front();`
			`return true;`
			`}`
			`std::uint32_t ret = 0;`
			`string_range sr = r;`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`sr.pop_front();`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`/* continuation bytes */`
			`for (; ch & 0x40; ch <<= 1) {`
			`/* need a continuation byte but nothing left in the string */`
			`if (sr.empty()) {`
			`return false;`
			`}`
			`/* the continuation byte */`
			`std::uint32_t nch = static_cast<unsigned char const>(sr.front());`
			`sr.pop_front();`
			`/* lower 6 bits */`
			`std::uint32_t bch = nch & 0x3F;`
			`/* not a continuation byte */`
			`if ((nch ^ bch) != 0x80) {`
			`return false;`
			`}`
			`/* the 6 bits go in the result */`
			`ret = (ret << 6) \| bch;`
			`}`
			`/* number of continuation bytes */`
			`std::size_t n = sr.data() - r.data() - 1;`
			`/* invalid sequence - too many continuation bits */`
			`if (n > 3) {`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`return false;`
			`}`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`/* add the up to 7 bits from the first byte, already shifted left by n */`
			`ret \|= (ch & 0x7F) << (n * 5);`
			`/* invalid sequence - out of bounds */`
			`if ((ret > MaxCodepoint) \|\| (ret <= ulim[n])) {`
			`return false;`
			`}`
reject surrogate code points in decoding 2018-01-01 02:36:05 +01:00			`/* invalid sequence - surrogate code point */`
			`if ((ret & 0xD800) == 0xD800) {`
			`return false;`
			`}`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`cret = ret;`
			`r = sr;`
			`return true;`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`}`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00
			`std::uint8_t u8_encode(`
			`std::uint8_t (&ret)[4], std::uint32_t ch`
			`) noexcept {`
			`if (ch <= 0x7F) {`
			`ret[0] = ch;`
			`return 1;`
			`}`
			`if (ch <= 0x7FF) {`
			`ret[0] = 0xC0 \| (ch >> 6);`
			`ret[1] = 0x80 \| (ch & 0x3F);`
			`return 2;`
			`}`
			`if (ch <= 0xFFFF) {`
			`/* TODO: optional WTF-8 semantics`
			`* for now simply reject surrogate code points`
			`*/`
			`if ((ch & 0xD800) == 0xD800) {`
			`return 0;`
			`}`
			`ret[0] = 0xE0 \| (ch >> 12);`
			`ret[1] = 0x80 \| ((ch >> 6) & 0x3F);`
			`ret[2] = 0x80 \| (ch & 0x3F);`
			`return 3;`
			`}`
			`if (ch <= MaxCodepoint) {`
			`ret[0] = 0xF0 \| (ch >> 18);`
			`ret[1] = 0x80 \| ((ch >> 12) \| 0x3F);`
			`ret[2] = 0x80 \| ((ch >> 6) \| 0x3F);`
			`ret[3] = 0x80 \| (ch \| 0x3F);`
			`return 4;`
			`}`
			`return 0;`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`}`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`} /* namespace detail */`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00
rename codepoint to decode 2017-12-31 20:06:36 +01:00			`bool decode(string_range &r, char32_t &ret) noexcept {`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`return detail::u8_decode(r, ret);`
expose multibyte-to-codepoint conversion 2017-12-31 03:26:15 +01:00			`}`

expose unicode stuff through string slices 2017-12-31 19:12:51 +01:00			`std::size_t length(string_range r, string_range &cont) noexcept {`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`std::size_t ret = 0;`
add a function to encode utf-32 to utf-8 2018-01-01 01:02:49 +01:00			`for (char32_t ch = U'\0'; detail::u8_decode(r, ch); ++ret) {`
add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`continue;`
			`}`
			`cont = r;`
			`return ret;`
			`}`

implement all the unicode ctype funcs, generate the tables 2018-01-02 22:23:03 +01:00			`/* unicode-aware ctype`
			`* the other ones use custom tables for lookups`
			`*/`

			`bool isalnum(char32_t c) {`
			`return (utf::isalpha(c) \|\| utf::isdigit(c));`
			`}`

			`bool isblank(char32_t c) {`
			`return ((c == ' ') \|\| (c == '\t'));`
			`}`

			`bool isgraph(char32_t c) {`
			`return (!utf::isspace(c) && utf::isprint(c));`
			`}`

			`bool isprint(char32_t c) {`
			`switch (std::uint32_t(c)) {`
			`case 0x2028:`
			`case 0x2029:`
			`case 0xFFF9:`
			`case 0xFFFA:`
			`case 0xFFFB:`
			`return false;`
			`default:`
			`return !utf::iscntrl(c);`
			`}`
			`}`

			`bool ispunct(char32_t c) {`
			`return (utf::isgraph(c) && !utf::isalnum(c));`
			`}`

			`bool isvalid(char32_t c) {`
			`/* surrogate code points */`
			`if ((c >= 0xD800) && (c <= 0xDFFF)) {`
			`return false;`
			`}`
			`/* non-characters */`
			`if ((c >= 0xFDD0) && (c <= 0xFDEF)) {`
			`return false;`
			`}`
			`/* end of plane */`
			`if ((c & 0xFFFE) == 0xFFFE) {`
			`return false;`
			`}`
			`/* must be within range */`
			`return (c <= MaxCodepoint);`
			`}`

			`bool isxdigit(char32_t c) {`
			`if ((c >= '0') && (c <= '9')) {`
			`return true;`
			`}`
			`auto uc = std::uint32_t(c) \| 32;`
			`return ((uc >= 'a') && (uc <= 'f'));`
			`}`

			`inline int codepoint_cmp1(void const a, void const b) {`
			`char32_t c1 = static_cast<char32_t const >(a);`
			`char32_t c2 = static_cast<char32_t const >(b);`
			`return (c1 - c2);`
			`}`

			`inline int codepoint_cmp2(void const a, void const b) {`
			`char32_t c = static_cast<char32_t const >(a);`
			`char32_t const p = static_cast<char32_t const >(b);`
			`if ((c >= p[0]) && (c <= p[1])) {`
			`return 0;`
			`}`
			`return (c - p[0]);`
			`}`

			`template<`
			`std::size_t RangesN, std::size_t RangesS,`
			`std::size_t Laces1N, std::size_t Laces1S,`
			`std::size_t Laces2N, std::size_t Laces2S,`
			`std::size_t SinglesN, std::size_t SinglesS`
			`>`
			`struct uctype_func {`
			`static bool do_is(`
			`char32_t c,`
			`void const *ranges [[maybe_unused]],`
			`void const *laces1 [[maybe_unused]],`
			`void const *laces2 [[maybe_unused]],`
			`void const *singles [[maybe_unused]]`
			`) {`
			`if constexpr(RangesN != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, ranges, RangesN / RangesS, RangesS / sizeof(char32_t),`
			`codepoint_cmp2`
			`));`
			`if (found) {`
			`return true;`
			`}`
			`}`
			`if constexpr(Laces1N != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, laces1, Laces1N / Laces1S, Laces1S / sizeof(char32_t),`
			`codepoint_cmp2`
			`));`
			`if (found) {`
			`return !((c - found[0]) % 2);`
			`}`
			`}`
			`if constexpr(Laces2N != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, laces2, Laces2N / Laces2S, Laces2S / sizeof(char32_t),`
			`codepoint_cmp2`
			`));`
			`if (found) {`
			`return !((c - found[0]) % 2);`
			`}`
			`}`
			`if constexpr(SinglesN != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, singles, SinglesN / SinglesS, SinglesS / sizeof(char32_t),`
			`codepoint_cmp1`
			`));`
			`if (found) {`
			`return true;`
			`}`
			`}`
			`return false;`
			`}`

			`static char32_t do_to(`
			`char32_t c,`
			`void const *ranges [[maybe_unused]],`
			`void const *laces1 [[maybe_unused]],`
			`void const *laces2 [[maybe_unused]],`
			`void const *singles [[maybe_unused]]`
			`) {`
			`if constexpr(RangesN != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, ranges, RangesN >> 4, RangesN & 0xF, codepoint_cmp2`
			`));`
			`if (found) {`
			`return (found[2] + (c - found[0]));`
			`}`
			`}`
			`if constexpr(Laces1N != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, laces1, Laces1N >> 4, Laces1N & 0xF, codepoint_cmp2`
			`));`
			`if (found) {`
			`if ((c - found[0]) % 2) {`
			`return c;`
			`}`
			`return c + 1;`
			`}`
			`}`
			`if constexpr(Laces2N != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, laces2, Laces2N >> 4, Laces2N & 0xF, codepoint_cmp2`
			`));`
			`if (found) {`
			`if ((c - found[0]) % 2) {`
			`return c;`
			`}`
			`return c - 1;`
			`}`
			`}`
			`if constexpr(SinglesN != 0) {`
			`char32_t found = static_cast<char32_t >(std::bsearch(`
			`&c, singles, SinglesN >> 4, SinglesN & 0xF, codepoint_cmp1`
			`));`
			`if (found) {`
			`return found[1];`
			`}`
			`}`
			`return c;`
			`}`
			`};`

			`/* these are geneated */`
			`bool isalpha(char32_t c);`
			`bool iscntrl(char32_t c);`
			`bool isdigit(char32_t c);`
			`bool islower(char32_t c);`
			`bool isspace(char32_t c);`
			`bool istitle(char32_t c);`
			`bool isupper(char32_t c);`
			`char32_t tolower(char32_t c);`
			`char32_t toupper(char32_t c);`

			`#include "string_utf.hh"`

add some initial code for upcoming unicode support 2017-12-31 03:00:23 +01:00			`} /* namespace utf */`
			`} /* namespace ostd */`