// boost/json/detail/utf8.hpp
//
// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/json
//
#ifndef BOOST_JSON_DETAIL_UTF8_HPP
#define BOOST_JSON_DETAIL_UTF8_HPP
#include <boost/endian/conversion.hpp>
#include <boost/json/detail/config.hpp>
#include <cstddef>
#include <cstring>
#include <cstdint>
namespace boost {
namespace json {
namespace detail {
// Load the first N bytes at `p` as a little-endian unsigned integer.
//
// The bytes are copied into the lowest-addressed bytes of a zeroed
// 32-bit word, which is then converted from little-endian to the
// native byte order. Byte k of the input therefore ends up in bits
// [8k, 8k + 7] of the result, and the bytes that were not loaded
// read as zero.
//
// N  number of bytes to read; must not exceed the size of the result.
// p  pointer to at least N readable bytes; no alignment is required
//    because the bytes are copied with memcpy.
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    // memcpy below writes N bytes into a 4-byte object: reject any
    // instantiation that would overflow it (or wrap to a huge size).
    static_assert(
        N >= 0 && N <= static_cast<int>(sizeof(std::uint32_t)),
        "load_little_endian: N must be in the range [0, 4]");
    std::uint32_t v = 0;
    std::memcpy(&v, p, N);
    endian::little_to_native_inplace(v);
    return v;
}
// Classify a byte as the lead byte of a multi-byte UTF-8 sequence.
//
// The result packs two fields:
//   low byte   total length of the sequence in bytes (2, 3 or 4)
//   high byte  class code selecting the allowed range of the
//              second byte (this is what is_valid_utf8 switches on)
// A result of 0 means the byte cannot start a multi-byte sequence.
//
// Only the low seven bits of `c` are used to index the table, so a
// byte below 0x80 yields the same result as that byte plus 0x80.
// NOTE(review): presumably callers only pass bytes >= 0x80 - confirm.
inline
uint16_t
classify_utf8(char c)
{
    constexpr uint16_t bad = 0x000; // invalid
    constexpr uint16_t two = 0x102; // 2 bytes, second byte [80, BF]
    constexpr uint16_t e0  = 0x203; // 3 bytes, second byte [A0, BF]
    constexpr uint16_t e1  = 0x303; // 3 bytes, second byte [80, BF]
    constexpr uint16_t ed  = 0x403; // 3 bytes, second byte [80, 9F]
    constexpr uint16_t f0  = 0x504; // 4 bytes, second byte [90, BF]
    constexpr uint16_t f1  = 0x604; // 4 bytes, second byte [80, BF]
    constexpr uint16_t f4  = 0x704; // 4 bytes, second byte [80, 8F]

    // Entry i describes byte 0x80 + i; one row per 16 byte values.
    static constexpr uint16_t lead_table[128]
    {
        // 80 .. 8F
        bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad,
        // 90 .. 9F
        bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad,
        // A0 .. AF
        bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad,
        // B0 .. BF
        bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad,
        // C0 .. CF (C0 and C1 are rejected)
        bad, bad, two, two, two, two, two, two, two, two, two, two, two, two, two, two,
        // D0 .. DF
        two, two, two, two, two, two, two, two, two, two, two, two, two, two, two, two,
        // E0 .. EF
        e0,  e1,  e1,  e1,  e1,  e1,  e1,  e1,  e1,  e1,  e1,  e1,  e1,  ed,  e1,  e1,
        // F0 .. FF (F5 and above are rejected)
        f0,  f1,  f1,  f1,  f4,  bad, bad, bad, bad, bad, bad, bad, bad, bad, bad, bad,
    };

    // drop the top bit so that 0x80..0xFF maps onto 0..127
    unsigned char const slot =
        static_cast<unsigned char>(c) & 0x7F;
    return lead_table[slot];
}
// Check the trailing bytes of the multi-byte UTF-8 sequence at `p`.
//
// p      points at the lead byte; 2, 3 or 4 bytes are read depending
//        on the class, and nothing is read for an unknown class.
// first  classification of the lead byte as produced by
//        classify_utf8; only its high byte (the class code) is used.
//
// Returns true when every trailing byte has the form 10xxxxxx and
// the second byte lies in the range required by the class. The lead
// byte itself is loaded but masked out of every comparison.
inline
bool
is_valid_utf8(const char* p, uint16_t first)
{
    // The sequence is loaded little-endian, so byte k of the input
    // occupies bits [8k, 8k + 7] of the loaded word.
    switch(first >> 8)
    {
    case 1:
        // 2 bytes, second byte [80, BF]
        return (load_little_endian<2>(p) & 0xC000) == 0x8000;

    case 2:
        // 3 bytes, second byte [A0, BF]
        return (load_little_endian<3>(p) & 0xC0E000) == 0x80A000;

    case 3:
        // 3 bytes, second byte [80, BF]
        return (load_little_endian<3>(p) & 0xC0C000) == 0x808000;

    case 4:
        // 3 bytes, second byte [80, 9F]
        return (load_little_endian<3>(p) & 0xC0E000) == 0x808000;

    case 5:
        // 4 bytes, second byte [90, BF]
        //
        // After masking, a valid sequence lies in
        // [0x80809000, 0x8080BF00]. Adding 0x7F7F7000 wraps that
        // interval onto [0, 0x2F00] in unsigned 32-bit arithmetic,
        // so a single comparison covers both the continuation bits
        // and the second-byte range.
        return (load_little_endian<4>(p) & 0xC0C0FF00)
            + 0x7F7F7000 <= 0x2F00;

    case 6:
        // 4 bytes, second byte [80, BF]
        return (load_little_endian<4>(p) & 0xC0C0C000) == 0x80808000;

    case 7:
        // 4 bytes, second byte [80, 8F]
        return (load_little_endian<4>(p) & 0xC0C0F000) == 0x80808000;

    default:
        break;
    }
    // class 0 (invalid lead byte) or an unknown class code
    return false;
}
// Buffer for a single multi-byte UTF-8 sequence that may arrive in
// pieces: save() stores the lead byte and whatever follows it,
// append() adds further bytes until the sequence is complete, and
// valid() checks the finished sequence.
//
// There is no constructor; the members hold indeterminate values
// until save() has been called.
class utf8_sequence
{
    // bytes collected so far; only the first size_ are meaningful
    char seq_[4];
    // classify_utf8 result for the lead byte
    uint16_t first_;
    // number of bytes currently stored in seq_
    uint8_t size_;

public:
    // Start a new sequence from the lead byte at `p`.
    //
    // `remain` is the number of bytes available at `p`. At most
    // length() of them are copied. `p` is dereferenced even when
    // `remain` is zero, so it must always point at a readable byte.
    void
    save(
        const char* p,
        std::size_t remain) noexcept
    {
        first_ = classify_utf8(*p);
        uint8_t const total = length();
        size_ = remain < total
            ? static_cast<uint8_t>(remain)
            : total;
        std::memcpy(seq_, p, size_);
    }

    // Total number of bytes in the sequence, taken from the low
    // byte of the lead-byte classification (0 for an invalid lead).
    uint8_t
    length() const noexcept
    {
        return static_cast<uint8_t>(first_ & 0xFF);
    }

    // True once at least length() bytes have been stored.
    bool
    complete() const noexcept
    {
        return length() <= size_;
    }

    // Add bytes from `p` (of which `remain` are available) to the
    // stored sequence, taking no more than are still needed.
    //
    // returns true if complete
    bool
    append(
        const char* p,
        std::size_t remain) noexcept
    {
        uint8_t const missing = needed();
        if(BOOST_JSON_UNLIKELY(missing == 0))
            return true;

        if(BOOST_JSON_UNLIKELY(remain < missing))
        {
            // not enough input to finish: keep what there is
            if(BOOST_JSON_LIKELY(remain > 0))
            {
                std::memcpy(seq_ + size_, p, remain);
                size_ += static_cast<uint8_t>(remain);
            }
            return false;
        }

        std::memcpy(seq_ + size_, p, missing);
        size_ = length();
        return true;
    }

    // Pointer to the stored bytes (not null-terminated).
    const char*
    data() const noexcept
    {
        return &seq_[0];
    }

    // Number of bytes still missing from the sequence.
    uint8_t
    needed() const noexcept
    {
        return static_cast<uint8_t>(length() - size_);
    }

    // Validate the stored sequence; it must already be complete.
    bool
    valid() const noexcept
    {
        BOOST_ASSERT(size_ >= length());
        return is_valid_utf8(seq_, first_);
    }
};
} // namespace detail
} // namespace json
} // namespace boost
#endif