Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for an old version of Boost. Click here to view this page for the latest version.

boost/spirit/home/support/char_encoding/unicode/query.hpp

/*=============================================================================
    Copyright (c) 2001-2011 Joel de Guzman

    Distributed under the Boost Software License, Version 1.0. (See accompanying
    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

    Autogenerated by MultiStageTable.py (Unicode multi-stage
    table builder) (c) Peter Kankowski, 2008
==============================================================================*/
#if !defined(BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010)
#define BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010

#include <boost/cstdint.hpp>

# include "category_table.hpp"
# include "script_table.hpp"
# include "lowercase_table.hpp"
# include "uppercase_table.hpp"

namespace boost { namespace spirit { namespace ucd
{
    // This header provides Basic (Level 1) Unicode Support
    // See http://unicode.org/reports/tr18/ for details

    // Holder for the enumerations used by the query functions in this
    // header. It has no data members; it only scopes the enumerator names
    // (properties::letter, properties::latin, ...).
    struct properties
    {
        // Layout of the value masked out by get_category() (mask 0x3F):
        //   bit pattern: xxMMMCCC
        //   MMM: major_category (bits 3-5, recovered with ">> 3")
        //   CCC: position of the category inside its major category (bits 0-2)
        // The bits above these six are the derived_properties flags below.

        // Coarse grouping of the general categories. The enumerators take
        // the implicit values 0..6, which equal (category >> 3) for every
        // category enumerator of the matching group.
        enum major_category
        {
            letter,
            mark,
            number,
            separator,
            other,
            punctuation,
            symbol
        };

        // General categories. Each group starts at (major_category << 3),
        // which is why the explicit values advance in steps of 8.
        enum category
        {
            uppercase_letter = 0,   // [Lu] an uppercase letter
            lowercase_letter,       // [Ll] a lowercase letter
            titlecase_letter,       // [Lt] a digraphic character, with first part uppercase
            modifier_letter,        // [Lm] a modifier letter
            other_letter,           // [Lo] other letters, including syllables and ideographs

            nonspacing_mark = 8,    // [Mn] a nonspacing combining mark (zero advance width)
            enclosing_mark,         // [Me] an enclosing combining mark
            spacing_mark,           // [Mc] a spacing combining mark (positive advance width)

            decimal_number = 16,    // [Nd] a decimal digit
            letter_number,          // [Nl] a letterlike numeric character
            other_number,           // [No] a numeric character of other type

            space_separator = 24,   // [Zs] a space character (of various non-zero widths)
            line_separator,         // [Zl] U+2028 LINE SEPARATOR only
            paragraph_separator,    // [Zp] U+2029 PARAGRAPH SEPARATOR only

            control = 32,           // [Cc] a C0 or C1 control code
            format,                 // [Cf] a format control character
            private_use,            // [Co] a private-use character
            surrogate,              // [Cs] a surrogate code point
            unassigned,             // [Cn] a reserved unassigned code point or a noncharacter

            dash_punctuation = 40,  // [Pd] a dash or hyphen punctuation mark
            open_punctuation,       // [Ps] an opening punctuation mark (of a pair)
            close_punctuation,      // [Pe] a closing punctuation mark (of a pair)
            connector_punctuation,  // [Pc] a connecting punctuation mark, like a tie
            other_punctuation,      // [Po] a punctuation mark of other type
            initial_punctuation,    // [Pi] an initial quotation mark
            final_punctuation,      // [Pf] a final quotation mark

            math_symbol = 48,       // [Sm] a symbol of primarily mathematical use
            currency_symbol,        // [Sc] a currency sign
            modifier_symbol,        // [Sk] a non-letterlike modifier symbol
            other_symbol            // [So] a symbol of other type
        };

        // Single-bit flags (bits 6 to 12) stored in the same table entry as
        // the category. The is_* query functions test them by AND-ing the
        // result of detail::category_lookup() with one of these values.
        enum derived_properties
        {
            alphabetic = 64,
            uppercase = 128,
            lowercase = 256,
            white_space = 512,
            hex_digit = 1024,
            noncharacter_code_point = 2048,
            default_ignorable_code_point = 4096
        };

        // Script identifiers. The values are implicit: 0 for the first
        // enumerator, increasing by one in declaration order.
        // get_script() casts the result of detail::script_lookup() straight
        // to this type, so the order here presumably has to match the
        // generated script table -- do not reorder or insert entries without
        // regenerating it (NOTE(review): confirm against script_table.hpp).
        // NOTE(review): the order looks like it follows the ISO 15924
        // four-letter script codes rather than the English names -- confirm.
        enum script
        {
            adlam,
            caucasian_albanian,
            ahom,
            arabic,
            imperial_aramaic,
            armenian,
            avestan,
            balinese,
            bamum,
            bassa_vah,
            batak,
            bengali,
            bhaiksuki,
            bopomofo,
            brahmi,
            braille,
            buginese,
            buhid,
            chakma,
            canadian_aboriginal,
            carian,
            cham,
            cherokee,
            chorasmian,
            coptic,
            cypro_minoan,
            cypriot,
            cyrillic,
            devanagari,
            dives_akuru,
            dogra,
            deseret,
            duployan,
            egyptian_hieroglyphs,
            elbasan,
            elymaic,
            ethiopic,
            georgian,
            glagolitic,
            gunjala_gondi,
            masaram_gondi,
            gothic,
            grantha,
            greek,
            gujarati,
            gurmukhi,
            hangul,
            han,
            hanunoo,
            hatran,
            hebrew,
            hiragana,
            anatolian_hieroglyphs,
            pahawh_hmong,
            nyiakeng_puachue_hmong,
            katakana_or_hiragana,
            old_hungarian,
            old_italic,
            javanese,
            kayah_li,
            katakana,
            kawi,
            kharoshthi,
            khmer,
            khojki,
            khitan_small_script,
            kannada,
            kaithi,
            tai_tham,
            lao,
            latin,
            lepcha,
            limbu,
            linear_a,
            linear_b,
            lisu,
            lycian,
            lydian,
            mahajani,
            makasar,
            mandaic,
            manichaean,
            marchen,
            medefaidrin,
            mende_kikakui,
            meroitic_cursive,
            meroitic_hieroglyphs,
            malayalam,
            modi,
            mongolian,
            mro,
            meetei_mayek,
            multani,
            myanmar,
            nag_mundari,
            nandinagari,
            old_north_arabian,
            nabataean,
            newa,
            nko,
            nushu,
            ogham,
            ol_chiki,
            old_turkic,
            oriya,
            osage,
            osmanya,
            old_uyghur,
            palmyrene,
            pau_cin_hau,
            old_permic,
            phags_pa,
            inscriptional_pahlavi,
            psalter_pahlavi,
            phoenician,
            miao,
            inscriptional_parthian,
            rejang,
            hanifi_rohingya,
            runic,
            samaritan,
            old_south_arabian,
            saurashtra,
            signwriting,
            shavian,
            sharada,
            siddham,
            khudawadi,
            sinhala,
            sogdian,
            old_sogdian,
            sora_sompeng,
            soyombo,
            sundanese,
            syloti_nagri,
            syriac,
            tagbanwa,
            takri,
            tai_le,
            new_tai_lue,
            tamil,
            tangut,
            tai_viet,
            telugu,
            tifinagh,
            tagalog,
            thaana,
            thai,
            tibetan,
            tirhuta,
            tangsa,
            toto,
            ugaritic,
            vai,
            vithkuqi,
            warang_citi,
            wancho,
            old_persian,
            cuneiform,
            yezidi,
            yi,
            zanabazar_square,
            inherited,
            common,
            unknown
        };
    };

    // Returns the general category of the code point ch.
    // The table entry also carries the derived-property flags in its upper
    // bits, so only the low six bits (mask 0x3F) are kept.
    inline properties::category get_category(::boost::uint32_t ch)
    {
        typedef properties::category category_type;
        return static_cast<category_type>(0x3F & detail::category_lookup(ch));
    }

    // Returns the major category (letter, mark, number, ...) of ch.
    // A category value keeps its major category in bits 3-5, so dropping
    // the three low bits yields the major_category enumerator.
    inline properties::major_category get_major_category(::boost::uint32_t ch)
    {
        properties::category const full_category = get_category(ch);
        return static_cast<properties::major_category>(full_category >> 3);
    }

    // True when the major category of ch is punctuation.
    inline bool is_punctuation(::boost::uint32_t ch)
    {
        properties::major_category const major = get_major_category(ch);
        return major == properties::punctuation;
    }

    // True when the general category of ch is decimal_number ([Nd]).
    inline bool is_decimal_number(::boost::uint32_t ch)
    {
        return properties::decimal_number == get_category(ch);
    }

    // True when the table entry for ch has the hex_digit flag set.
    inline bool is_hex_digit(::boost::uint32_t ch)
    {
        bool const flag_set =
            (properties::hex_digit & detail::category_lookup(ch)) != 0;
        return flag_set;
    }

    // True when the general category of ch is control ([Cc]).
    inline bool is_control(::boost::uint32_t ch)
    {
        properties::category const cat = get_category(ch);
        return cat == properties::control;
    }

    // True when the table entry for ch has the alphabetic flag set.
    inline bool is_alphabetic(::boost::uint32_t ch)
    {
        return 0 != (detail::category_lookup(ch) & properties::alphabetic);
    }

    // True when ch is either a decimal digit or alphabetic.
    inline bool is_alphanumeric(::boost::uint32_t ch)
    {
        // The digit test goes first, as in the combined expression it replaces.
        if (is_decimal_number(ch))
            return true;
        return is_alphabetic(ch);
    }

    // True when the table entry for ch has the uppercase flag set.
    inline bool is_uppercase(::boost::uint32_t ch)
    {
        bool const has_uppercase_flag =
            (detail::category_lookup(ch) & properties::uppercase) != 0;
        return has_uppercase_flag;
    }

    // True when the table entry for ch has the lowercase flag set.
    inline bool is_lowercase(::boost::uint32_t ch)
    {
        return 0 != (properties::lowercase & detail::category_lookup(ch));
    }

    // True when the table entry for ch has the white_space flag set.
    inline bool is_white_space(::boost::uint32_t ch)
    {
        bool const has_white_space_flag =
            (properties::white_space & detail::category_lookup(ch)) != 0;
        return has_white_space_flag;
    }

    // Tells whether ch is a "blank" in the sense of the POSIX compatibility
    // properties of UTS #18 (Annex C): white space minus every character
    // that ends a line, i.e.
    //   \p{Whitespace} -- [LF VT FF CR NEL \p{Zl} \p{Zp}]
    // which per UTS #18 amounts to the space separators plus TAB.
    inline bool is_blank(::boost::uint32_t ch)
    {
        switch (ch)
        {
            // Line-breaking controls. They carry the White_Space property
            // in the Unicode Character Database, so they have to be
            // rejected before the property test below. U+0085 (NEXT LINE)
            // belongs to this group just like LF, VT, FF and CR.
            case '\n': case '\v': case '\f': case '\r': case 0x85:
                return false;
            default:
            {
                if (!is_white_space(ch))
                    return false;

                // U+2028 / U+2029 are white space too, but they separate
                // lines and paragraphs, so they are not blanks either.
                properties::category const cat = get_category(ch);
                return cat != properties::line_separator
                    && cat != properties::paragraph_separator;
            }
        }
    }

    // True for every code point that is none of: white space, a control
    // code, a surrogate, or unassigned.
    inline bool is_graph(::boost::uint32_t ch)
    {
        if (is_white_space(ch))
            return false;

        switch (get_category(ch))
        {
            case properties::control:
            case properties::surrogate:
            case properties::unassigned:
                return false;
            default:
                return true;
        }
    }

    // True when ch is a graphic character or a blank, and is not a
    // control code.
    inline bool is_print(::boost::uint32_t ch)
    {
        bool const graph_or_blank = is_graph(ch) || is_blank(ch);
        if (!graph_or_blank)
            return false;
        return !is_control(ch);
    }

    // True when the table entry for ch has the noncharacter_code_point
    // flag set.
    inline bool is_noncharacter_code_point(::boost::uint32_t ch)
    {
        return 0 != (detail::category_lookup(ch)
                        & properties::noncharacter_code_point);
    }

    // True when the table entry for ch has the
    // default_ignorable_code_point flag set.
    inline bool is_default_ignorable_code_point(::boost::uint32_t ch)
    {
        bool const ignorable =
            (properties::default_ignorable_code_point
                & detail::category_lookup(ch)) != 0;
        return ignorable;
    }

    // Returns the script of ch: the value produced by the script table
    // lookup, reinterpreted as a properties::script enumerator.
    inline properties::script get_script(::boost::uint32_t ch)
    {
        typedef properties::script script_type;
        return static_cast<script_type>(detail::script_lookup(ch));
    }

    // Maps ch to its lowercase counterpart, or returns ch unchanged when
    // the table holds no mapping for it.
    inline ::boost::uint32_t to_lowercase(::boost::uint32_t ch)
    {
        ::boost::uint32_t const mapped = detail::lowercase_lookup(ch);

        // A table value of 0 is the marker for "maps to itself".
        if (mapped == 0)
            return ch;
        return mapped;
    }

    // Maps ch to its uppercase counterpart, or returns ch unchanged when
    // the table holds no mapping for it.
    inline ::boost::uint32_t to_uppercase(::boost::uint32_t ch)
    {
        ::boost::uint32_t const mapped = detail::uppercase_lookup(ch);

        // A table value of 0 is the marker for "maps to itself".
        if (mapped != 0)
            return mapped;
        return ch;
    }
}}}

#endif