boost/parser/detail/text/transcode_algorithm.hpp
// Copyright (C) 2018 Robert N. Steagall
// Copyright (C) 2019 T. Zachary Laine
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_PARSER_DETAIL_TEXT_TRANSCODE_ALGORITHM_HPP
#define BOOST_PARSER_DETAIL_TEXT_TRANSCODE_ALGORITHM_HPP
#include <boost/parser/detail/text/in_out_result.hpp>
#include <boost/parser/detail/text/transcode_iterator.hpp>
#include <boost/parser/detail/text/unpack.hpp>
#if BOOST_PARSER_DETAIL_TEXT_USE_CONCEPTS
#include <algorithm>
#endif
#include <boost/parser/config.hpp>
namespace boost::parser::detail { namespace text {
/** An alias for `in_out_result` returned by algorithms that perform a
transcoding copy. */
template<typename Iter, typename OutIter>
using transcode_result = in_out_result<Iter, OutIter>;
namespace detail {
template<typename OutIter>
constexpr OutIter read_into_utf8_iter(uint32_t cp, OutIter out)
{
if (cp < 0x80) {
*out = static_cast<char>(cp);
++out;
} else if (cp < 0x800) {
*out = static_cast<char>(0xC0 + (cp >> 6));
++out;
*out = static_cast<char>(0x80 + (cp & 0x3f));
++out;
} else if (cp < 0x10000) {
*out = static_cast<char>(0xe0 + (cp >> 12));
++out;
*out = static_cast<char>(0x80 + ((cp >> 6) & 0x3f));
++out;
*out = static_cast<char>(0x80 + (cp & 0x3f));
++out;
} else {
*out = static_cast<char>(0xf0 + (cp >> 18));
++out;
*out = static_cast<char>(0x80 + ((cp >> 12) & 0x3f));
++out;
*out = static_cast<char>(0x80 + ((cp >> 6) & 0x3f));
++out;
*out = static_cast<char>(0x80 + (cp & 0x3f));
++out;
}
return out;
}
template<typename OutIter>
constexpr OutIter read_into_utf16_iter(uint32_t cp, OutIter out)
{
uint16_t const high_surrogate_base = 0xd7c0;
uint16_t const low_surrogate_base = 0xdc00;
if (cp < 0x10000) {
*out = static_cast<uint16_t>(cp);
++out;
} else {
*out = static_cast<uint16_t>(cp >> 10) + high_surrogate_base;
++out;
*out = static_cast<uint16_t>(cp & 0x3ff) + low_surrogate_base;
++out;
}
return out;
}
template<
bool UseN,
typename InputIter,
typename Sentinel,
typename OutIter>
transcode_result<InputIter, OutIter> transcode_utf_8_to_16(
InputIter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out,
std::input_iterator_tag)
{
for (; first != last && (!UseN || n); --n) {
unsigned char const c = *first;
if (c < 0x80) {
*out = *first;
++first;
++out;
} else {
auto const cp = detail::advance(first, last);
out = detail::read_into_utf16_iter(cp, out);
}
}
return {first, out};
}
template<bool UseN, typename Iter, typename OutIter>
transcode_result<Iter, OutIter> transcode_utf_8_to_16(
Iter first,
Iter last,
std::ptrdiff_t n,
OutIter out,
std::random_access_iterator_tag)
{
return transcode_utf_8_to_16<UseN>(
first, last, n, out, std::input_iterator_tag{});
}
template<
bool UseN,
typename InputIter,
typename Sentinel,
typename OutIter>
transcode_result<InputIter, OutIter> transcode_utf_8_to_32(
InputIter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out,
std::input_iterator_tag)
{
for (; first != last && (!UseN || n); --n) {
unsigned char const c = *first;
if (c < 0x80) {
*out = *first;
++first;
++out;
} else {
*out = detail::advance(first, last);
++out;
}
}
return {first, out};
}
template<bool UseN, typename Iter, typename OutIter>
transcode_result<Iter, OutIter> transcode_utf_8_to_32(
Iter first,
Iter last,
std::ptrdiff_t n,
OutIter out,
std::random_access_iterator_tag)
{
return transcode_utf_8_to_32<UseN>(
first, last, n, out, std::input_iterator_tag{});
}
template<format Tag>
struct tag_t
{};
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_8(
tag_t<format::utf8>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
for (; first != last && (!UseN || n); ++first, ++out) {
*out = *first;
--n;
}
return {first, out};
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_16(
tag_t<format::utf8>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
return detail::transcode_utf_8_to_16<UseN>(
first,
last,
n,
out,
typename std::iterator_traits<Iter>::iterator_category{});
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_32(
tag_t<format::utf8>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
return detail::transcode_utf_8_to_32<UseN>(
first,
last,
n,
out,
typename std::iterator_traits<Iter>::iterator_category{});
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_8(
tag_t<format::utf16>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
uint32_t const high_surrogate_max = 0xdbff;
uint16_t const high_surrogate_base = 0xd7c0;
uint16_t const low_surrogate_base = 0xdc00;
for (; first != last && (!UseN || n); ++first, --n) {
uint32_t const hi = *first;
if (surrogate(hi)) {
if (hi <= high_surrogate_max) {
++first;
if (first == last) {
uint32_t const cp = replacement_character;
out = detail::read_into_utf8_iter(cp, out);
++out;
return {first, out};
}
uint32_t const lo = *first;
if (low_surrogate(lo)) {
uint32_t const cp =
((hi - high_surrogate_base) << 10) +
(lo - low_surrogate_base);
out = detail::read_into_utf8_iter(cp, out);
continue;
}
}
out = detail::read_into_utf8_iter(
replacement_character, out);
} else {
out = detail::read_into_utf8_iter(hi, out);
}
}
return {first, out};
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_16(
tag_t<format::utf16>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
for (; first != last && (!UseN || n); ++first, ++out, --n) {
*out = *first;
}
return {first, out};
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_32(
tag_t<format::utf16>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
uint32_t const high_surrogate_max = 0xdbff;
uint16_t const high_surrogate_base = 0xd7c0;
uint16_t const low_surrogate_base = 0xdc00;
for (; first != last && (!UseN || n); ++first, --n) {
uint32_t const hi = *first;
if (surrogate(hi)) {
if (hi <= high_surrogate_max) {
++first;
if (first == last) {
*out = replacement_character;
++out;
return {first, out};
}
uint32_t const lo = *first;
if (low_surrogate(lo)) {
uint32_t const cp =
((hi - high_surrogate_base) << 10) +
(lo - low_surrogate_base);
*out = cp;
++out;
continue;
}
}
*out = replacement_character;
++out;
} else {
*out = hi;
++out;
}
}
return {first, out};
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_8(
tag_t<format::utf32>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
for (; first != last && (!UseN || n); ++first, --n) {
out = detail::read_into_utf8_iter(*first, out);
}
return {first, out};
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_16(
tag_t<format::utf32>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
for (; first != last && (!UseN || n); ++first, --n) {
out = detail::read_into_utf16_iter(*first, out);
}
return {first, out};
}
template<bool UseN, typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_32(
tag_t<format::utf32>,
Iter first,
Sentinel last,
std::ptrdiff_t n,
OutIter out)
{
for (; first != last && (!UseN || n); ++first, ++out, --n) {
*out = *first;
}
return {first, out};
}
}
#if 0
/** Copies the code points in the range [first, last) to out, changing the
encoding from UTF-8 to UTF-32. */
template<typename InputIter, typename Sentinel, typename OutIter>
transcode_result<InputIter, OutIter> transcode_utf_8_to_32_take_n(
InputIter first, Sentinel last, std::ptrdiff_t n, OutIter out)
{
auto const r = detail::unpack_iterator_and_sentinel(first, last);
return detail::transcode_to_32<true>(
detail::tag_t<r.format_tag>{}, r.first, r.last, n, out);
}
/** Copies the first `n` code points in the range [first, last) to out,
changing the encoding from UTF-8 to UTF-32. */
template<typename InputIter, typename Sentinel, typename OutIter>
transcode_result<InputIter, OutIter> transcode_utf_8_to_32_take_n(
InputIter first, Sentinel last, std::ptrdiff_t n, OutIter out)
{
auto const r = detail::unpack_iterator_and_sentinel(first, last);
return detail::transcode_to_32<true>(
detail::tag_t<r.format_tag>{}, r.first, r.last, n, out);
}
/** Copies the first `n` code points in the range [first, last) to out,
changing the encoding from UTF-8 to UTF-32. */
template<typename InputIter, typename Sentinel, typename OutIter>
transcode_result<InputIter, OutIter>
transcode_utf_8_to_32_take_n(Range && r, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_utf_8_to_32_dispatch<true, Range, OutIter>::
call(r, n, out)
.out;
}
#endif
}}
namespace boost::parser::detail { namespace text { BOOST_PARSER_DETAIL_TEXT_NAMESPACE_V1 {
#if defined(BOOST_TEXT_DOXYGEN)
// -> utf8
/** Copies the code points in the range `[first, last)` to `out`, changing
the encoding to UTF-8. */
template<
std::input_iterator I,
std::sentinel_for<I> S,
std::output_iterator<uint8_t> O>
requires(
utf16_code_unit<std::iter_value_t<I>> ||
utf32_code_unit<std::iter_value_t<I>>)
transcode_result<I, O> transcode_to_utf8(I first, S last, O out);
/** Copies the code points in the range `[p, null_sentinel)` to `out`,
changing the encoding to UTF-8. */
template<typename Ptr, std::output_iterator<uint8_t> O>
requires(utf16_pointer<Ptr> || utf32_pointer<Ptr>)
transcode_result<Ptr, O> transcode_to_utf8(Ptr p, O out);
/** Copies the code points in the array `arr` to `out`, changing the
encoding to UTF-8. */
template<std::size_t N, typename Char, std::output_iterator<uint8_t> O>
requires (utf16_code_unit<Char> || utf32_code_unit<Char>)
transcode_result<Char *, O> transcode_to_utf8(Char (&arr)[N], O out);
/** Copies the code points in the range `r` to `out`, changing the
encoding to UTF-8. */
template<std::ranges::input_range R, std::output_iterator<uint8_t> O>
requires (utf16_code_unit<std::ranges::range_value_t<R>> ||
utf32_code_unit<std::ranges::range_value_t<R>>)
transcode_result<std::ranges::borrowed_iterator_t<R>, O>
transcode_to_utf8(R && r, O out);
// -> utf16
/** Copies the code points in the range `[first, last)` to `out`, changing
the encoding to UTF-16. */
template<
std::input_iterator I,
std::sentinel_for<I> S,
std::output_iterator<char16_t> O>
requires (utf8_code_unit<std::iter_value_t<I>> ||
utf32_code_unit<std::iter_value_t<I>>)
transcode_result<I, O> transcode_to_utf16(I first, S last, O out);
/** Copies the code points in the range `[p, null_sentinel)` to `out`,
changing the encoding to UTF-16. */
template<typename Ptr, std::output_iterator<char16_t> O>
requires (utf8_pointer<Ptr> || utf32_pointer<Ptr>)
transcode_result<Ptr, O> transcode_to_utf16(Ptr p, O out);
/** Copies the code points in the array `arr` to `out`, changing the
encoding to UTF-16. */
template<std::size_t N, typename Char, std::output_iterator<char16_t> O>
requires (utf8_code_unit<Char> || utf32_code_unit<Char>)
transcode_result<Char *, O> transcode_to_utf16(Char (&arr)[N], O out);
/** Copies the code points in the range `r` to `out`, changing the
encoding to UTF-16. */
template<std::ranges::input_range R, std::output_iterator<cjar16_t> O>
requires (utf8_code_unit<std::ranges::range_value_t<R>> ||
utf32_code_unit<std::ranges::range_value_t<R>>)
transcode_result<std::ranges::borrowed_iterator_t<R>, O>
transcode_to_utf16(R && r, O out);
// -> utf32
/** Copies the code points in the range `[first, last)` to `out`, changing
the encoding to UTF-32. */
template<
std::input_iterator I,
std::sentinel_for<I> S,
std::output_iterator<uint32_t> O>
requires (utf8_code_unit<std::iter_value_t<I>> ||
utf16_code_unit<std::iter_value_t<I>>)
transcode_result<I, O> transcode_to_utf32(I first, S last, O out);
/** Copies the code points in the range `[p, null_sentinel)` to `out`,
changing the encoding to UTF-32. */
template<typename Ptr, std::output_iterator<uint32_t> O>
requires (utf8_pointer<Ptr> || utf16_pointer<Ptr>)
transcode_result<Ptr, O> transcode_to_utf32(Ptr p, O out);
/** Copies the code points in the array `arr` to `out`, changing the
encoding to UTF-32. */
template<std::size_t N, typename Char, std::output_iterator<uint32_t> O>
requires (utf8_code_unit<Char> || utf16_code_unit<Char>)
transcode_result<Char *, O> transcode_to_utf32(Char (&arr)[N], O out);
/** Copies the code points in the range `r` to `out`, changing the
encoding to UTF-32. */
template<std::ranges::input_range R, std::output_iterator<uint32_t> O>
requires (utf8_code_unit<std::ranges::range_value_t<R>> ||
utf16_code_unit<std::ranges::range_value_t<R>>)
transcode_result<std::ranges::borrowed_iterator_t<R>, O>
transcode_to_utf32(R && r, O out);
#endif
namespace dtl {
template<
bool UseN,
typename Range,
typename OutIter,
bool _16Ptr = detail::is_16_ptr_v<Range>,
bool CPPtr = detail::is_cp_ptr_v<Range>>
struct transcode_to_8_dispatch
{
static constexpr auto
call(Range && r, std::ptrdiff_t n, OutIter out)
-> transcode_result<decltype(detail::begin(r)), OutIter>
{
auto const u = text::unpack_iterator_and_sentinel(
detail::begin(r), detail::end(r));
auto unpacked = detail::transcode_to_8<UseN>(
detail::tag_t<u.format_tag>{}, u.first, u.last, n, out);
return {u.repack(unpacked.in), unpacked.out};
}
};
template<bool UseN, typename Ptr, typename OutIter>
struct transcode_to_8_dispatch<UseN, Ptr, OutIter, true, false>
{
static constexpr auto
call(Ptr p, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_to_8<UseN>(
detail::tag_t<format::utf16>{}, p, null_sentinel, n, out);
}
};
template<bool UseN, typename Ptr, typename OutIter>
struct transcode_to_8_dispatch<UseN, Ptr, OutIter, false, true>
{
static constexpr auto
call(Ptr p, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_to_8<UseN>(
detail::tag_t<format::utf32>{}, p, null_sentinel, n, out);
}
};
template<
bool UseN,
typename Range,
typename OutIter,
bool CharPtr = detail::is_char_ptr_v<Range>,
bool CPPtr = detail::is_cp_ptr_v<Range>>
struct transcode_to_16_dispatch
{
static constexpr auto
call(Range && r, std::ptrdiff_t n, OutIter out)
-> transcode_result<decltype(detail::begin(r)), OutIter>
{
auto const u = text::unpack_iterator_and_sentinel(
detail::begin(r), detail::end(r));
auto unpacked = detail::transcode_to_16<UseN>(
detail::tag_t<u.format_tag>{}, u.first, u.last, n, out);
return {u.repack(unpacked.in), unpacked.out};
}
};
template<bool UseN, typename Ptr, typename OutIter>
struct transcode_to_16_dispatch<UseN, Ptr, OutIter, true, false>
{
static constexpr auto
call(Ptr p, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_to_16<UseN>(
detail::tag_t<format::utf8>{}, p, null_sentinel, n, out);
}
};
template<bool UseN, typename Ptr, typename OutIter>
struct transcode_to_16_dispatch<UseN, Ptr, OutIter, false, true>
{
static constexpr auto
call(Ptr p, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_to_16<UseN>(
detail::tag_t<format::utf32>{}, p, null_sentinel, n, out);
}
};
template<
bool UseN,
typename Range,
typename OutIter,
bool CharPtr = detail::is_char_ptr_v<Range>,
bool _16Ptr = detail::is_16_ptr_v<Range>>
struct transcode_to_32_dispatch
{
static constexpr auto
call(Range && r, std::ptrdiff_t n, OutIter out)
-> transcode_result<decltype(detail::begin(r)), OutIter>
{
auto const u = text::unpack_iterator_and_sentinel(
detail::begin(r), detail::end(r));
auto unpacked = detail::transcode_to_32<UseN>(
detail::tag_t<u.format_tag>{}, u.first, u.last, n, out);
return {u.repack(unpacked.in), unpacked.out};
}
};
template<bool UseN, typename Ptr, typename OutIter>
struct transcode_to_32_dispatch<UseN, Ptr, OutIter, true, false>
{
static constexpr auto
call(Ptr p, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_to_32<UseN>(
detail::tag_t<format::utf8>{}, p, null_sentinel, n, out);
}
};
template<bool UseN, typename Ptr, typename OutIter>
struct transcode_to_32_dispatch<UseN, Ptr, OutIter, false, true>
{
static constexpr auto
call(Ptr p, std::ptrdiff_t n, OutIter out)
{
return detail::transcode_to_32<UseN>(
detail::tag_t<format::utf16>{}, p, null_sentinel, n, out);
}
};
}
template<typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_utf8(
Iter first, Sentinel last, OutIter out)
{
auto const r = text::unpack_iterator_and_sentinel(first, last);
auto unpacked = detail::transcode_to_8<false>(
detail::tag_t<r.format_tag>{}, r.first, r.last, -1, out);
return {r.repack(unpacked.in), unpacked.out};
}
template<typename Range, typename OutIter>
transcode_result<detail::iterator_t<Range>, OutIter>
transcode_to_utf8(Range && r, OutIter out)
{
return dtl::transcode_to_8_dispatch<false, Range, OutIter>::call(
r, -1, out);
}
template<typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_utf16(
Iter first, Sentinel last, OutIter out)
{
auto const r = text::unpack_iterator_and_sentinel(first, last);
auto unpacked = detail::transcode_to_16<false>(
detail::tag_t<r.format_tag>{}, r.first, r.last, -1, out);
return {r.repack(unpacked.in), unpacked.out};
}
template<typename Range, typename OutIter>
transcode_result<detail::iterator_t<Range>, OutIter>
transcode_to_utf16(Range && r, OutIter out)
{
return dtl::transcode_to_16_dispatch<false, Range, OutIter>::call(
r, -1, out);
}
template<typename Iter, typename Sentinel, typename OutIter>
transcode_result<Iter, OutIter> transcode_to_utf32(
Iter first, Sentinel last, OutIter out)
{
auto const r = text::unpack_iterator_and_sentinel(first, last);
auto unpacked = detail::transcode_to_32<false>(
detail::tag_t<r.format_tag>{}, r.first, r.last, -1, out);
return {r.repack(unpacked.in), unpacked.out};
}
template<typename Range, typename OutIter>
transcode_result<detail::iterator_t<Range>, OutIter>
transcode_to_utf32(Range && r, OutIter out)
{
return dtl::transcode_to_32_dispatch<false, Range, OutIter>::call(
r, -1, out);
}
}}}
#if BOOST_PARSER_DETAIL_TEXT_USE_CONCEPTS
namespace boost::parser::detail { namespace text { BOOST_PARSER_DETAIL_TEXT_NAMESPACE_V2 {
// -> utf8
template<
std::input_iterator I,
std::sentinel_for<I> S,
std::output_iterator<uint8_t> O>
requires(
utf16_code_unit<std::iter_value_t<I>> ||
utf32_code_unit<std::iter_value_t<I>>)
transcode_result<I, O> transcode_to_utf8(I first, S last, O out)
{
auto const r = text::unpack_iterator_and_sentinel(first, last);
auto unpacked = detail::transcode_to_8<false>(
detail::tag_t<r.format_tag>{}, r.first, r.last, -1, out);
return {r.repack(unpacked.in), unpacked.out};
}
template<typename R, std::output_iterator<uint32_t> O>
requires(utf16_range<R> || utf32_range<R>)
transcode_result<dtl::uc_result_iterator<R>, O> transcode_to_utf8(
R && r, O out)
{
return text::transcode_to_utf8(
std::ranges::begin(r), std::ranges::end(r), out);
}
// -> utf16
template<
std::input_iterator I,
std::sentinel_for<I> S,
std::output_iterator<char16_t> O>
requires(
utf8_code_unit<std::iter_value_t<I>> ||
utf32_code_unit<std::iter_value_t<I>>)
transcode_result<I, O> transcode_to_utf16(I first, S last, O out)
{
auto const r = text::unpack_iterator_and_sentinel(first, last);
auto unpacked = detail::transcode_to_16<false>(
detail::tag_t<r.format_tag>{}, r.first, r.last, -1, out);
return {r.repack(unpacked.in), unpacked.out};
}
template<typename R, std::output_iterator<uint32_t> O>
requires(utf8_range<R> || utf32_range<R>)
transcode_result<dtl::uc_result_iterator<R>, O> transcode_to_utf16(
R && r, O out)
{
return text::transcode_to_utf16(
std::ranges::begin(r), std::ranges::end(r), out);
}
// -> utf32
template<
std::input_iterator I,
std::sentinel_for<I> S,
std::output_iterator<uint32_t> O>
requires(
utf8_code_unit<std::iter_value_t<I>> ||
utf16_code_unit<std::iter_value_t<I>>)
transcode_result<I, O> transcode_to_utf32(I first, S last, O out)
{
auto const r = text::unpack_iterator_and_sentinel(first, last);
auto unpacked = detail::transcode_to_32<false>(
detail::tag_t<r.format_tag>{}, r.first, r.last, -1, out);
return {r.repack(unpacked.in), unpacked.out};
}
template<typename R, std::output_iterator<uint32_t> O>
requires(utf8_range<R> || utf16_range<R>)
transcode_result<dtl::uc_result_iterator<R>, O> transcode_to_utf32(
R && r, O out)
{
return text::transcode_to_utf32(
std::ranges::begin(r), std::ranges::end(r), out);
}
}}}
#endif
#endif