Boost C++ Libraries

...one of the most highly regarded and expertly designed C++ library projects in the world. Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for an old version of Boost. Click here to view this page for the latest version.

boost/xpressive/regex_compiler.hpp

///////////////////////////////////////////////////////////////////////////////
/// \file regex_compiler.hpp
/// Contains the definition of regex_compiler, a factory for building regex objects
/// from strings.
//
//  Copyright 2007 Eric Niebler. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
#define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005

// MS compatible compilers support #pragma once
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
# pragma once
#endif

#include <map>
#include <boost/assert.hpp>
#include <boost/next_prior.hpp>
#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>
#include <boost/mpl/assert.hpp>
#include <boost/throw_exception.hpp>
#include <boost/type_traits/is_same.hpp>
#include <boost/iterator/iterator_traits.hpp>
#include <boost/xpressive/basic_regex.hpp>
#include <boost/xpressive/detail/dynamic/parser.hpp>
#include <boost/xpressive/detail/dynamic/parse_charset.hpp>
#include <boost/xpressive/detail/dynamic/parser_enum.hpp>
#include <boost/xpressive/detail/dynamic/parser_traits.hpp>
#include <boost/xpressive/detail/core/linker.hpp>
#include <boost/xpressive/detail/core/optimize.hpp>

namespace boost { namespace xpressive
{

///////////////////////////////////////////////////////////////////////////////
// regex_compiler
//
/// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
///
/// Class template regex_compiler is used to construct a basic_regex object from a string. The string
/// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
/// after which all basic_regex objects created with that regex_compiler object will use that locale.
/// After creating a regex_compiler object, and optionally imbueing it with a locale, you can call the
/// compile() method to construct a basic_regex object, passing it the string representing the regular
/// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
/// objects compiled from the same string will have different regex_id's.
template<typename BidiIter, typename RegexTraits, typename CompilerTraits>
struct regex_compiler
{
    typedef BidiIter iterator_type;
    typedef typename iterator_value<BidiIter>::type char_type;
    typedef regex_constants::syntax_option_type flag_type;
    typedef RegexTraits traits_type;
    typedef typename traits_type::string_type string_type;
    typedef typename traits_type::locale_type locale_type;
    typedef typename traits_type::char_class_type char_class_type;

    explicit regex_compiler(RegexTraits const &traits = RegexTraits())
      : mark_count_(0)
      , hidden_mark_count_(0)
      , traits_(traits)
      , upper_(0)
      , self_()
      , rules_()
    {
        this->upper_ = lookup_classname(this->rxtraits(), "upper");
    }

    ///////////////////////////////////////////////////////////////////////////
    // imbue
    /// Specify the locale to be used by a regex_compiler.
    ///
    /// \param loc The locale that this regex_compiler should use.
    /// \return The previous locale.
    locale_type imbue(locale_type loc)
    {
        locale_type oldloc = this->traits_.imbue(loc);
        this->upper_ = lookup_classname(this->rxtraits(), "upper");
        return oldloc;
    }

    ///////////////////////////////////////////////////////////////////////////
    // getloc
    /// Get the locale used by a regex_compiler.
    ///
    /// \return The locale used by this regex_compiler.
    locale_type getloc() const
    {
        return this->traits_.getloc();
    }

    ///////////////////////////////////////////////////////////////////////////
    // compile
    /// Builds a basic_regex object from a range of characters.
    ///
    /// \param  begin The beginning of a range of characters representing the
    ///         regular expression to compile.
    /// \param  end The end of a range of characters representing the
    ///         regular expression to compile.
    /// \param  flags Optional bitmask that determines how the pat string is
    ///         interpreted. (See syntax_option_type.)
    /// \return A basic_regex object corresponding to the regular expression
    ///         represented by the character range.
    /// \pre    InputIter is a model of the InputIterator concept.
    /// \pre    [begin,end) is a valid range.
    /// \pre    The range of characters specified by [begin,end) contains a 
    ///         valid string-based representation of a regular expression.
    /// \throw  regex_error when the range of characters has invalid regular
    ///         expression syntax.
    template<typename InputIter>
    basic_regex<BidiIter> compile(InputIter begin, InputIter end, flag_type flags = regex_constants::ECMAScript)
    {
        typedef typename iterator_category<InputIter>::type category;
        return this->compile_(begin, end, flags, category());
    }

    /// \overload
    ///
    template<typename InputRange>
    basic_regex<BidiIter> compile(InputRange const &pat, flag_type flags = regex_constants::ECMAScript)
    {
        return this->compile(boost::begin(pat), boost::end(pat), flags);
    }

    /// \overload
    ///
    basic_regex<BidiIter> compile(char_type const *begin, flag_type flags = regex_constants::ECMAScript)
    {
        BOOST_ASSERT(0 != begin);
        char_type const *end = begin + std::char_traits<char_type>::length(begin);
        return this->compile(begin, end, flags);
    }

    /// \overload
    ///
    basic_regex<BidiIter> compile(char_type const *begin, std::size_t size, flag_type flags)
    {
        BOOST_ASSERT(0 != begin);
        char_type const *end = begin + size;
        return this->compile(begin, end, flags);
    }

    ///////////////////////////////////////////////////////////////////////////
    // operator[]
    /// Return a reference to the named regular expression. If no such named
    /// regular expression exists, create a new regular expression and return
    /// a reference to it.
    ///
    /// \param  name A std::string containing the name of the regular expression.
    /// \pre    The string is not empty.
    /// \throw  bad_alloc on allocation failure.
    basic_regex<BidiIter> &operator [](string_type const &name)
    {
        BOOST_ASSERT(!name.empty());
        return this->rules_[name];
    }

    /// \overload
    ///
    basic_regex<BidiIter> const &operator [](string_type const &name) const
    {
        BOOST_ASSERT(!name.empty());
        return this->rules_[name];
    }

private:

    typedef detail::escape_value<char_type, char_class_type> escape_value;
    typedef detail::alternate_matcher<detail::alternates_vector<BidiIter>, RegexTraits> alternate_matcher;

    ///////////////////////////////////////////////////////////////////////////
    // compile_
    /// INTERNAL ONLY
    template<typename FwdIter>
    basic_regex<BidiIter> compile_(FwdIter begin, FwdIter end, flag_type flags, std::forward_iterator_tag)
    {
        BOOST_MPL_ASSERT((is_same<char_type, typename iterator_value<FwdIter>::type>));
        using namespace regex_constants;
        this->reset();
        this->traits_.flags(flags);

        basic_regex<BidiIter> rextmp, *prex = &rextmp;
        FwdIter tmp = begin;

        // Check if this regex is a named rule:
        string_type name;
        if(token_group_begin == this->traits_.get_token(tmp, end) &&
           detail::ensure(tmp != end, error_paren, "mismatched parenthesis") &&
           token_rule_assign == this->traits_.get_group_type(tmp, end, name))
        {
            begin = tmp;
            detail::ensure
            (
                begin != end && token_group_end == this->traits_.get_token(begin, end)
              , error_paren
              , "mismatched parenthesis"
            );
            prex = &this->rules_[name];
        }

        this->self_ = detail::core_access<BidiIter>::get_regex_impl(*prex);

        // at the top level, a regex is a sequence of alternates
        detail::sequence<BidiIter> seq = this->parse_alternates(begin, end);
        detail::ensure(begin == end, error_paren, "mismatched parenthesis");

        // terminate the sequence
        seq += detail::make_dynamic<BidiIter>(detail::end_matcher());

        // bundle the regex information into a regex_impl object
        detail::common_compile(seq.xpr().matchable(), *this->self_, this->rxtraits());

        this->self_->traits_ = new detail::traits_holder<RegexTraits>(this->rxtraits());
        this->self_->mark_count_ = this->mark_count_;
        this->self_->hidden_mark_count_ = this->hidden_mark_count_;

        // References changed, update dependencies.
        this->self_->tracking_update();
        this->self_.reset();
        return *prex;
    }

    ///////////////////////////////////////////////////////////////////////////
    // compile_
    /// INTERNAL ONLY
    template<typename InputIter>
    basic_regex<BidiIter> compile_(InputIter begin, InputIter end, flag_type flags, std::input_iterator_tag)
    {
        string_type pat(begin, end);
        return this->compile_(boost::begin(pat), boost::end(pat), flags, std::forward_iterator_tag());
    }

    ///////////////////////////////////////////////////////////////////////////
    // reset
    /// INTERNAL ONLY
    void reset()
    {
        this->mark_count_ = 0;
        this->hidden_mark_count_ = 0;
        this->traits_.flags(regex_constants::ECMAScript);
    }

    ///////////////////////////////////////////////////////////////////////////
    // regex_traits
    /// INTERNAL ONLY
    traits_type &rxtraits()
    {
        return this->traits_.traits();
    }

    ///////////////////////////////////////////////////////////////////////////
    // regex_traits
    /// INTERNAL ONLY
    traits_type const &rxtraits() const
    {
        return this->traits_.traits();
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_alternates
    /// INTERNAL ONLY
    template<typename FwdIter>
    detail::sequence<BidiIter> parse_alternates(FwdIter &begin, FwdIter end)
    {
        using namespace regex_constants;
        int count = 0;
        FwdIter tmp = begin;
        detail::sequence<BidiIter> seq;

        do switch(++count)
        {
        case 1:
            seq = this->parse_sequence(tmp, end);
            break;
        case 2:
            seq = detail::make_dynamic<BidiIter>(alternate_matcher()) | seq;
            // fall-through
        default:
            seq |= this->parse_sequence(tmp, end);
        }
        while((begin = tmp) != end && token_alternate == this->traits_.get_token(tmp, end));

        return seq;
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_group
    /// INTERNAL ONLY
    template<typename FwdIter>
    detail::sequence<BidiIter> parse_group(FwdIter &begin, FwdIter end)
    {
        using namespace regex_constants;
        int mark_nbr = 0;
        bool keeper = false;
        bool lookahead = false;
        bool lookbehind = false;
        bool negative = false;
        string_type name;

        detail::sequence<BidiIter> seq, seq_end;
        FwdIter tmp = FwdIter();

        syntax_option_type old_flags = this->traits_.flags();

        switch(this->traits_.get_group_type(begin, end, name))
        {
        case token_no_mark:
            // Don't process empty groups like (?:) or (?i)
            // BUGBUG this doesn't handle the degenerate (?:)+ correctly
            if(token_group_end == this->traits_.get_token(tmp = begin, end))
            {
                return this->parse_atom(begin = tmp, end);
            }
            break;

        case token_negative_lookahead:
            negative = true; // fall-through
        case token_positive_lookahead:
            lookahead = true;
            seq_end = detail::make_dynamic<BidiIter>(detail::true_matcher());
            break;

        case token_negative_lookbehind:
            negative = true; // fall-through
        case token_positive_lookbehind:
            lookbehind = true;
            seq_end = detail::make_dynamic<BidiIter>(detail::true_matcher());
            break;

        case token_independent_sub_expression:
            keeper = true;
            seq_end = detail::make_dynamic<BidiIter>(detail::true_matcher());
            break;

        case token_comment:
            while(detail::ensure(begin != end, error_paren, "mismatched parenthesis"))
            {
                switch(this->traits_.get_token(begin, end))
                {
                case token_group_end: return this->parse_atom(begin, end);
                case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
                case token_literal: ++begin;
                default:;
                }
            }
            break;

        case token_recurse:
            detail::ensure
            (
                begin != end && token_group_end == this->traits_.get_token(begin, end)
              , error_paren
              , "mismatched parenthesis"
            );
            return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(this->self_));

        case token_rule_assign:
            boost::throw_exception(
                regex_error(error_badrule, "rule assignments must be at the front of the regex")
            );
            break;

        case token_rule_ref:
            {
                typedef detail::core_access<BidiIter> access;
                detail::ensure
                (
                    begin != end && token_group_end == this->traits_.get_token(begin, end)
                  , error_paren
                  , "mismatched parenthesis"
                );
                basic_regex<BidiIter> &rex = this->rules_[name];
                shared_ptr<detail::regex_impl<BidiIter> > impl = access::get_regex_impl(rex);
                this->self_->track_reference(*impl);
                return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(impl));
            }

        case token_named_mark:
            mark_nbr = static_cast<int>(++this->mark_count_);
            for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i)
            {
                detail::ensure(this->self_->named_marks_[i].name_ != name, error_badmark, "named mark already exists");
            }
            this->self_->named_marks_.push_back(detail::named_mark<char_type>(name, this->mark_count_));
            seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr));
            seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr));
            break;

        case token_named_mark_ref:
            detail::ensure
            (
                begin != end && token_group_end == this->traits_.get_token(begin, end)
              , error_paren
              , "mismatched parenthesis"
            );
            for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i)
            {
                if(this->self_->named_marks_[i].name_ == name)
                {
                    mark_nbr = static_cast<int>(this->self_->named_marks_[i].mark_nbr_);
                    return detail::make_backref_xpression<BidiIter>
                    (
                        mark_nbr, this->traits_.flags(), this->rxtraits()
                    );
                }
            }
            boost::throw_exception(regex_error(error_badmark, "invalid named back-reference"));
            break;

        default:
            mark_nbr = static_cast<int>(++this->mark_count_);
            seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr));
            seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr));
            break;
        }

        // alternates
        seq += this->parse_alternates(begin, end);
        seq += seq_end;
        detail::ensure
        (
            begin != end && token_group_end == this->traits_.get_token(begin, end)
          , error_paren
          , "mismatched parenthesis"
        );

        typedef detail::shared_matchable<BidiIter> xpr_type;
        if(lookahead)
        {
            detail::lookahead_matcher<xpr_type> lookahead(seq.xpr(), negative, seq.pure());
            seq = detail::make_dynamic<BidiIter>(lookahead);
        }
        else if(lookbehind)
        {
            detail::lookbehind_matcher<xpr_type> lookbehind(seq.xpr(), seq.width().value(), negative, seq.pure());
            seq = detail::make_dynamic<BidiIter>(lookbehind);
        }
        else if(keeper) // independent sub-expression
        {
            detail::keeper_matcher<xpr_type> keeper(seq.xpr(), seq.pure());
            seq = detail::make_dynamic<BidiIter>(keeper);
        }

        // restore the modifiers
        this->traits_.flags(old_flags);
        return seq;
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_charset
    /// INTERNAL ONLY
    template<typename FwdIter>
    detail::sequence<BidiIter> parse_charset(FwdIter &begin, FwdIter end)
    {
        detail::compound_charset<traits_type> chset;

        // call out to a helper to actually parse the character set
        detail::parse_charset(begin, end, chset, this->traits_);

        return detail::make_charset_xpression<BidiIter>
        (
            chset
          , this->rxtraits()
          , this->traits_.flags()
        );
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_atom
    /// INTERNAL ONLY
    template<typename FwdIter>
    detail::sequence<BidiIter> parse_atom(FwdIter &begin, FwdIter end)
    {
        using namespace regex_constants;
        escape_value esc = { 0, 0, 0, detail::escape_char };
        FwdIter old_begin = begin;

        switch(this->traits_.get_token(begin, end))
        {
        case token_literal:
            return detail::make_literal_xpression<BidiIter>
            (
                this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits()
            );

        case token_any:
            return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits());

        case token_assert_begin_sequence:
            return detail::make_dynamic<BidiIter>(detail::assert_bos_matcher());

        case token_assert_end_sequence:
            return detail::make_dynamic<BidiIter>(detail::assert_eos_matcher());

        case token_assert_begin_line:
            return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits());

        case token_assert_end_line:
            return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits());

        case token_assert_word_boundary:
            return detail::make_assert_word<BidiIter>(detail::word_boundary<true>(), this->rxtraits());

        case token_assert_not_word_boundary:
            return detail::make_assert_word<BidiIter>(detail::word_boundary<false>(), this->rxtraits());

        case token_assert_word_begin:
            return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits());

        case token_assert_word_end:
            return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits());

        case token_escape:
            esc = this->parse_escape(begin, end);
            switch(esc.type_)
            {
            case detail::escape_mark:
                return detail::make_backref_xpression<BidiIter>
                (
                    esc.mark_nbr_, this->traits_.flags(), this->rxtraits()
                );
            case detail::escape_char:
                return detail::make_char_xpression<BidiIter>
                (
                    esc.ch_, this->traits_.flags(), this->rxtraits()
                );
            case detail::escape_class:
                return detail::make_posix_charset_xpression<BidiIter>
                (
                    esc.class_
                  , this->is_upper_(*begin++)
                  , this->traits_.flags()
                  , this->rxtraits()
                );
            }

        case token_group_begin:
            return this->parse_group(begin, end);

        case token_charset_begin:
            return this->parse_charset(begin, end);

        case token_invalid_quantifier:
            boost::throw_exception(regex_error(error_badrepeat, "quantifier not expected"));
            break;

        case token_quote_meta_begin:
            return detail::make_literal_xpression<BidiIter>
            (
                this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits()
            );

        case token_quote_meta_end:
            boost::throw_exception(
                regex_error(
                    error_escape
                  , "found quote-meta end without corresponding quote-meta begin"
                )
            );
            break;

        case token_end_of_pattern:
            break;

        default:
            begin = old_begin;
            break;
        }

        return detail::sequence<BidiIter>();
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_quant
    /// INTERNAL ONLY
    template<typename FwdIter>
    detail::sequence<BidiIter> parse_quant(FwdIter &begin, FwdIter end)
    {
        BOOST_ASSERT(begin != end);
        detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ };
        detail::sequence<BidiIter> seq = this->parse_atom(begin, end);

        // BUGBUG this doesn't handle the degenerate (?:)+ correctly
        if(!seq.empty() && begin != end && detail::quant_none != seq.quant())
        {
            if(this->traits_.get_quant_spec(begin, end, spec))
            {
                BOOST_ASSERT(spec.min_ <= spec.max_);

                if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing.
                {
                    seq = this->parse_quant(begin, end);
                }
                else
                {
                    seq.repeat(spec);
                }
            }
        }

        return seq;
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_sequence
    /// INTERNAL ONLY
    template<typename FwdIter>
    detail::sequence<BidiIter> parse_sequence(FwdIter &begin, FwdIter end)
    {
        detail::sequence<BidiIter> seq;

        while(begin != end)
        {
            detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end);

            // did we find a quantified atom?
            if(seq_quant.empty())
                break;

            // chain it to the end of the xpression sequence
            seq += seq_quant;
        }

        return seq;
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_literal
    //  scan ahead looking for char literals to be globbed together into a string literal
    /// INTERNAL ONLY
    template<typename FwdIter>
    string_type parse_literal(FwdIter &begin, FwdIter end)
    {
        using namespace regex_constants;
        BOOST_ASSERT(begin != end);
        BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end));
        escape_value esc = { 0, 0, 0, detail::escape_char };
        string_type literal(1, *begin);

        for(FwdIter prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp)
        {
            detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ };
            if(this->traits_.get_quant_spec(tmp, end, spec))
            {
                if(literal.size() != 1)
                {
                    begin = prev;
                    literal.erase(boost::prior(literal.end()));
                }
                return literal;
            }
            else switch(this->traits_.get_token(tmp, end))
            {
            case token_escape:
                esc = this->parse_escape(tmp, end);
                if(detail::escape_char != esc.type_) return literal;
                literal.insert(literal.end(), esc.ch_);
                break;
            case token_literal:
                literal.insert(literal.end(), *tmp++);
                break;
            default:
                return literal;
            }
        }

        return literal;
    }

    ///////////////////////////////////////////////////////////////////////////
    // parse_quote_meta
    //  scan ahead looking for char literals to be globbed together into a string literal
    /// INTERNAL ONLY
    template<typename FwdIter>
    string_type parse_quote_meta(FwdIter &begin, FwdIter end)
    {
        using namespace regex_constants;
        FwdIter old_begin = begin, old_end;
        while(end != (old_end = begin))
        {
            switch(this->traits_.get_token(begin, end))
            {
            case token_quote_meta_end: return string_type(old_begin, old_end);
            case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
            case token_literal: ++begin;
            default:;
            }
        }
        return string_type(old_begin, begin);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // parse_escape
    /// INTERNAL ONLY
    template<typename FwdIter>
    escape_value parse_escape(FwdIter &begin, FwdIter end)
    {
        detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence");

        // first, check to see if this can be a backreference
        if(0 < this->rxtraits().value(*begin, 10))
        {
            // Parse at most 3 decimal digits.
            FwdIter tmp = begin;
            int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999);

            // If the resulting number could conceivably be a backref, then it is.
            if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_))
            {
                begin = tmp;
                escape_value esc = {0, mark_nbr, 0, detail::escape_mark};
                return esc;
            }
        }

        // Not a backreference, defer to the parse_escape helper
        return detail::parse_escape(begin, end, this->traits_);
    }

    bool is_upper_(char_type ch) const
    {
        return 0 != this->upper_ && this->rxtraits().isctype(ch, this->upper_);
    }

    std::size_t mark_count_;
    std::size_t hidden_mark_count_;
    CompilerTraits traits_;
    typename RegexTraits::char_class_type upper_;
    shared_ptr<detail::regex_impl<BidiIter> > self_;
    std::map<string_type, basic_regex<BidiIter> > rules_;
};

}} // namespace boost::xpressive

#endif