Boost C++ Libraries

...one of the most highly regarded and expertly designed C++ library projects in the world. Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

PrevUpHomeNext
Unicode Aware Regex Iterators
u32regex_iterator

Type u32regex_iterator is in all respects the same as regex_iterator, except that, since the regular expression type is always u32regex, it takes only one template parameter (the iterator type). It also calls u32regex_search internally, allowing it to interface correctly with UTF-8, UTF-16, and UTF-32 data:

// Synopsis of the Unicode-aware match iterator. The regular expression type
// is fixed to u32regex, so the iterator type over the text is the only
// template parameter.
template <class BidirectionalIterator>
class u32regex_iterator
{
   // for members see regex_iterator
};

// Convenience names for u32regex_iterator over raw pointer ranges; the
// typedef name records the encoding associated with each pointer type.
// NOTE(review): UChar and UChar32 are not declared on this page - presumably
// the ICU UTF-16 / UTF-32 character types; confirm against the ICU headers.
typedef u32regex_iterator<const char*>     utf8regex_iterator;
typedef u32regex_iterator<const UChar*>    utf16regex_iterator;
typedef u32regex_iterator<const UChar32*>  utf32regex_iterator;

In order to simplify the construction of a u32regex_iterator from a string, there is a series of non-member helper functions called make_u32regex_iterator:

// make_u32regex_iterator: non-member helpers that build a u32regex_iterator
// from a string. Every overload takes the text s, the expression e and the
// match flags m (defaulting to regex_constants::match_default), and returns
// an iterator that enumerates all occurrences of e in s.

// Text supplied as a const char* string.
u32regex_iterator<const char*>
   make_u32regex_iterator(const char* s,
                          const u32regex& e,
                          regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const wchar_t* string.
u32regex_iterator<const wchar_t*>
   make_u32regex_iterator(const wchar_t* s,
                          const u32regex& e,
                          regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const UChar* string.
u32regex_iterator<const UChar*>
   make_u32regex_iterator(const UChar* s,
                          const u32regex& e,
                          regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc>
u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
   make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s,
                          const u32regex& e,
                          regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a UnicodeString; note that the returned iterator is over
// const UChar*, not over a UnicodeString-specific iterator type.
u32regex_iterator<const UChar*>
   make_u32regex_iterator(const UnicodeString& s,
                          const u32regex& e,
                          regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that enumerates all occurrences of expression e, in text s, using match_flags m.

Example: search for international currency symbols, along with their associated numeric value:

void enumerate_currencies(const std::string& text)
{
   // enumerate and print all the currency symbols, along
   // with any associated numeric values:
   const char* re =
      "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
      "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
      "(?(1)"
         "|(?(2)"
            "[[:Cf:][:Cc:][:Z*:]]*"
         ")"
         "[[:Sc:]]"
      ")";
   boost::u32regex r = boost::make_u32regex(re);
   boost::u32regex_iterator<std::string::const_iterator>
         i(boost::make_u32regex_iterator(text, r)), j;
   while(i != j)
   {
      std::cout << (*i)[0] << std::endl;
      ++i;
   }
}

Calling

enumerate_currencies(" $100.23 or £198.12 ");

Yields the output:

$100.23
£198.12

(provided, of course, that the input is encoded as UTF-8).

u32regex_token_iterator

Type u32regex_token_iterator is in all respects the same as regex_token_iterator, except that, since the regular expression type is always u32regex, it takes only one template parameter (the iterator type). It also calls u32regex_search internally, allowing it to interface correctly with UTF-8, UTF-16, and UTF-32 data:

// Synopsis of the Unicode-aware token iterator. The regular expression type
// is fixed to u32regex, so the iterator type over the text is the only
// template parameter.
template <class BidirectionalIterator>
class u32regex_token_iterator
{
   // for members see regex_token_iterator
};

// Convenience names for u32regex_token_iterator over raw pointer ranges; the
// typedef name records the encoding associated with each pointer type.
// NOTE(review): UChar and UChar32 are not declared on this page - presumably
// the ICU UTF-16 / UTF-32 character types; confirm against the ICU headers.
typedef u32regex_token_iterator<const char*>     utf8regex_token_iterator;
typedef u32regex_token_iterator<const UChar*>    utf16regex_token_iterator;
typedef u32regex_token_iterator<const UChar32*>  utf32regex_token_iterator;

In order to simplify the construction of a u32regex_token_iterator from a string, there is a series of non-member helper functions called make_u32regex_token_iterator:

// make_u32regex_token_iterator, single sub-expression form. Every overload
// takes the text s, the expression e, one marked sub-expression number sub
// and the match flags m (defaulting to regex_constants::match_default), and
// returns an iterator that enumerates all occurrences of sub-expression sub
// of e found in s.

// Text supplied as a const char* string.
u32regex_token_iterator<const char*>
   make_u32regex_token_iterator(
         const char* s,
         const u32regex& e,
         int sub,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const wchar_t* string.
u32regex_token_iterator<const wchar_t*>
   make_u32regex_token_iterator(
         const wchar_t* s,
         const u32regex& e,
         int sub,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const UChar* string.
u32regex_token_iterator<const UChar*>
   make_u32regex_token_iterator(
         const UChar* s,
         const u32regex& e,
         int sub,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc>
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
   make_u32regex_token_iterator(
         const std::basic_string<charT, Traits, Alloc>& s,
         const u32regex& e,
         int sub,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a UnicodeString; the returned iterator is over
// const UChar*.
u32regex_token_iterator<const UChar*>
   make_u32regex_token_iterator(
         const UnicodeString& s,
         const u32regex& e,
         int sub,
         regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that enumerates all occurrences of marked sub-expression sub in regular expression e, found in text s, using match_flags m.

// make_u32regex_token_iterator, array form. submatch is a reference to an
// array of N ints; the returned iterator enumerates one sub-expression for
// each entry of submatch, for expression e applied to the text, using match
// flags m (defaulting to regex_constants::match_default).
// Note that the text parameter is named p in every overload except the
// UnicodeString one, where it is named s.

// Text supplied as a const char* string.
template <std::size_t N>
u32regex_token_iterator<const char*>
   make_u32regex_token_iterator(
         const char* p,
         const u32regex& e,
         const int (&submatch)[N],
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const wchar_t* string.
template <std::size_t N>
u32regex_token_iterator<const wchar_t*>
   make_u32regex_token_iterator(
         const wchar_t* p,
         const u32regex& e,
         const int (&submatch)[N],
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const UChar* string.
template <std::size_t N>
u32regex_token_iterator<const UChar*>
   make_u32regex_token_iterator(
         const UChar* p,
         const u32regex& e,
         const int (&submatch)[N],
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc, std::size_t N>
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
   make_u32regex_token_iterator(
         const std::basic_string<charT, Traits, Alloc>& p,
         const u32regex& e,
         const int (&submatch)[N],
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a UnicodeString; the returned iterator is over
// const UChar*.
template <std::size_t N>
u32regex_token_iterator<const UChar*>
   make_u32regex_token_iterator(
         const UnicodeString& s,
         const u32regex& e,
         const int (&submatch)[N],
         regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that, for every match of regular expression e found in the text (parameter p, or s for the UnicodeString overload), enumerates one sub-expression for each entry in submatch, using match flags m.

// make_u32regex_token_iterator, vector form. submatch is a std::vector<int>;
// the returned iterator enumerates one sub-expression for each entry of
// submatch, for expression e applied to the text, using match flags m
// (defaulting to regex_constants::match_default).
// Note that the text parameter is named p in every overload except the
// UnicodeString one, where it is named s.

// Text supplied as a const char* string.
u32regex_token_iterator<const char*>
   make_u32regex_token_iterator(
         const char* p,
         const u32regex& e,
         const std::vector<int>& submatch,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const wchar_t* string.
u32regex_token_iterator<const wchar_t*>
   make_u32regex_token_iterator(
         const wchar_t* p,
         const u32regex& e,
         const std::vector<int>& submatch,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a const UChar* string.
u32regex_token_iterator<const UChar*>
   make_u32regex_token_iterator(
         const UChar* p,
         const u32regex& e,
         const std::vector<int>& submatch,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc>
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
   make_u32regex_token_iterator(
         const std::basic_string<charT, Traits, Alloc>& p,
         const u32regex& e,
         const std::vector<int>& submatch,
         regex_constants::match_flag_type m = regex_constants::match_default);

// Text supplied as a UnicodeString; the returned iterator is over
// const UChar*.
u32regex_token_iterator<const UChar*>
   make_u32regex_token_iterator(
         const UnicodeString& s,
         const u32regex& e,
         const std::vector<int>& submatch,
         regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that, for every match of regular expression e found in the text (parameter p, or s for the UnicodeString overload), enumerates one sub-expression for each entry in submatch, using match flags m.

Example: search for international currency symbols and their associated numeric values, printing only the first marked sub-expression (the leading currency symbol part) of each match:

void enumerate_currencies2(const std::string& text)
{
   // enumerate and print all the currency symbols, along
   // with any associated numeric values:
   const char* re =
      "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
      "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
      "(?(1)"
         "|(?(2)"
            "[[:Cf:][:Cc:][:Z*:]]*"
         ")"
         "[[:Sc:]]"
      ")";
   boost::u32regex r = boost::make_u32regex(re);
   boost::u32regex_token_iterator<std::string::const_iterator>
      i(boost::make_u32regex_token_iterator(text, r, 1)), j;
   while(i != j)
   {
      std::cout << *i << std::endl;
      ++i;
   }
}

PrevUpHomeNext