Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for an old version of Boost. Click here to view this page for the latest version.

boost/json/detail/utf8.hpp

//
// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/json
//

#ifndef BOOST_JSON_DETAIL_UTF8_HPP
#define BOOST_JSON_DETAIL_UTF8_HPP

#include <boost/endian/conversion.hpp>
#include <boost/json/detail/config.hpp>

#include <cstddef>
#include <cstring>
#include <cstdint>

namespace boost {
namespace json {
namespace detail {

// Reads the first N bytes at `p` as a little-endian unsigned integer.
//
// The N bytes are copied over the lowest-addressed bytes of a zeroed
// 32-bit value, which is then converted from little-endian to the
// native byte order. Bytes past the first N are never read.
//
// NOTE(review): N is not checked here; it presumably has to lie in
// [0, 4] because the destination is a 32-bit object - confirm callers.
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    std::uint32_t word = 0;
    std::memcpy(&word, p, N);
    endian::little_to_native_inplace(word);
    return word;
}

// Classifies a UTF-8 lead byte.
//
// The result packs two fields into 16 bits:
//   low byte  = total length of the sequence in bytes
//   high byte = which range the second byte must fall in
//
//   0x000 = invalid
//   0x102 = 2 bytes, second byte [80, BF]
//   0x203 = 3 bytes, second byte [A0, BF]
//   0x303 = 3 bytes, second byte [80, BF]
//   0x403 = 3 bytes, second byte [80, 9F]
//   0x504 = 4 bytes, second byte [90, BF]
//   0x604 = 4 bytes, second byte [80, BF]
//   0x704 = 4 bytes, second byte [80, 8F]
//
// Only the low 7 bits of `c` select the table entry, so a byte below
// 0x80 yields the same result as that byte with the top bit set.
// NOTE(review): callers presumably pass only bytes >= 0x80 - confirm.
inline
uint16_t
classify_utf8(char c)
{
    // One entry per byte value 0x80..0xFF, indexed by (byte - 0x80).
    static constexpr uint16_t lead_info[128] =
    {
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 80..87
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 88..8F
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 90..97
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 98..9F
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // A0..A7
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // A8..AF
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // B0..B7
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // B8..BF

       0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // C0..C7
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // C8..CF
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // D0..D7
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // D8..DF
       0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, // E0..E7
       0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303, // E8..EF
       0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, // F0..F7
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // F8..FF
    };

    // Masking yields a value in [0, 127] whether char is signed or not.
    const int slot = c & 0x7F;
    return lead_info[slot];
}

// Checks the trailing bytes of the UTF-8 sequence that starts at `p`.
//
// `first` is the classification of the lead byte (see classify_utf8);
// its high byte selects which of the rules below is applied. The
// sequence is loaded as a single little-endian integer, so the lead
// byte occupies bits 0-7, the second byte bits 8-15, the third bits
// 16-23 and the fourth bits 24-31. Each rule masks the bits that matter
// and tests them with one comparison. The lead byte is masked out and
// not examined again.
//
// `p` must point to as many readable bytes as the selected rule loads
// (2, 3 or 4). Returns false if the high byte of `first` is not 1..7.
inline
bool
is_valid_utf8(const char* p, uint16_t first)
{
    switch(first >> 8)
    {
    // 2 bytes, second byte [80, BF]
    case 1:
        return (load_little_endian<2>(p) & 0xC000) == 0x8000;

    // 3 bytes, second byte [A0, BF]
    case 2:
        return (load_little_endian<3>(p) & 0xC0E000) == 0x80A000;

    // 3 bytes, second byte [80, BF]
    case 3:
        return (load_little_endian<3>(p) & 0xC0C000) == 0x808000;

    // 3 bytes, second byte [80, 9F]
    case 4:
        return (load_little_endian<3>(p) & 0xC0E000) == 0x808000;

    // 4 bytes, second byte [90, BF]
    case 5:
        // The acceptable masked values are 0x80809000..0x8080BF00.
        // Adding 0x7F7F7000 in 32-bit unsigned arithmetic wraps that
        // range around to 0..0x2F00, so one comparison covers it.
        return (load_little_endian<4>(p) & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;

    // 4 bytes, second byte [80, BF]
    case 6:
        return (load_little_endian<4>(p) & 0xC0C0C000) == 0x80808000;

    // 4 bytes, second byte [80, 8F]
    case 7:
        return (load_little_endian<4>(p) & 0xC0C0F000) == 0x80808000;

    // unknown classification, including 0 (invalid lead byte)
    default:
        break;
    }
    return false;
}

// Accumulates the bytes of one UTF-8 sequence that may be supplied in
// several pieces.
//
// Intended use: call save() with a pointer to the lead byte, then
// append() with further input until it returns true, then valid().
// There is no constructor and no member initializer, so every other
// member function reads uninitialized state until save() has run.
class utf8_sequence
{
    char seq_[4];       // bytes collected so far, unused tail is zero
    uint16_t first_;    // classify_utf8() result for the lead byte
    uint8_t size_;      // number of leading bytes of seq_ collected

public:
    // Begins a new sequence from the `remain` bytes available at `p`.
    //
    // `*p` is read unconditionally to classify the lead byte, so `p`
    // must point to at least one readable byte. At most length() bytes
    // are copied; if fewer are available the sequence is left
    // incomplete and can be continued with append().
    void
    save(
        const char* p,
        std::size_t remain) noexcept
    {
        first_ = classify_utf8(*p );
        if(remain >= length())
            size_ = length();
        else
            size_ = static_cast<uint8_t>(remain);
        // Clear the whole buffer before the partial copy so that bytes
        // which have not arrived yet hold a defined value rather than
        // whatever was in the object before. A zero byte is never a
        // continuation byte, so an incomplete sequence cannot pass
        // is_valid_utf8() by accident.
        std::memset(seq_, 0, sizeof(seq_));
        std::memcpy(seq_, p, size_);
    }

    // Total number of bytes in the sequence according to the lead
    // byte (the low byte of the classification); 0 if the lead byte
    // was classified as invalid.
    uint8_t
    length() const noexcept
    {
        return static_cast<uint8_t>(first_ & 0xFF);
    }

    // True once at least length() bytes have been collected.
    bool
    complete() const noexcept
    {
        return size_ >= length();
    }

    // Continues the sequence with up to `remain` bytes from `p`.
    //
    // Copies only as many bytes as are still needed. If fewer than
    // that are available, all `remain` bytes are stored and the
    // sequence stays incomplete.
    //
    // returns true if complete
    bool
    append(
        const char* p,
        std::size_t remain) noexcept
    {
        if(BOOST_JSON_UNLIKELY(needed() == 0))
            return true;
        if(BOOST_JSON_LIKELY(remain >= needed()))
        {
            std::memcpy(
                seq_ + size_, p, needed());
            size_ = length();
            return true;
        }
        if(BOOST_JSON_LIKELY(remain > 0))
        {
            // remain < needed() <= 4 here, so the cast cannot truncate
            std::memcpy(seq_ + size_, p, remain);
            size_ += static_cast<uint8_t>(remain);
        }
        return false;
    }

    // Pointer to the internal 4-byte buffer; only the first size_
    // bytes hold input, the remainder is zero.
    const char*
    data() const noexcept
    {
        return seq_;
    }

    // Number of bytes still missing before the sequence is complete.
    uint8_t
    needed() const noexcept
    {
        return static_cast<uint8_t>(length() - size_);
    }

    // Validates the collected bytes against the rule selected by the
    // lead byte. The sequence must be complete; this is asserted.
    // Because save() zero-fills the buffer, a call on an incomplete
    // sequence in a build where the assertion is compiled out returns
    // false instead of examining indeterminate bytes.
    bool
    valid() const noexcept
    {
        BOOST_ASSERT(size_ >= length());
        return is_valid_utf8(seq_, first_);
    }
};

} // detail
} // namespace json
} // namespace boost

#endif