include/boost/token_functions.hpp

0001 // Boost token_functions.hpp  ------------------------------------------------//
0002
0003 // Copyright John R. Bandela 2001.
0004
0005 // Distributed under the Boost Software License, Version 1.0. (See
0006 // accompanying file LICENSE_1_0.txt or copy at
0007 // http://www.boost.org/LICENSE_1_0.txt)
0008
0009 // See http://www.boost.org/libs/tokenizer/ for documentation.
0010
0011 // Revision History:
0012 // 01 Oct 2004   Joaquin M Lopez Munoz
0013 //      Workaround for a problem with string::assign in msvc-stlport
0014 // 06 Apr 2004   John Bandela
0015 //      Fixed a bug involving using char_delimiter with a true input iterator
0016 // 28 Nov 2003   Robert Zeh and John Bandela
0017 //      Converted into "fast" functions that avoid using += when
0018 //      the supplied iterator isn't an input_iterator; based on
0019 //      some work done at Archelon and a version that was checked into
0020 //      the boost CVS for a short period of time.
0021 // 20 Feb 2002   John Maddock
0022 //      Removed using namespace std declarations and added
0023 //      workaround for BOOST_NO_STDC_NAMESPACE (the library
0024 //      can be safely mixed with regex).
0025 // 06 Feb 2002   Jeremy Siek
0026 //      Added char_separator.
0027 // 02 Feb 2002   Jeremy Siek
0028 //      Removed tabs and a little cleanup.
0029
0030
0031 #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
0032 #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
0033
0034 #include <vector>
0035 #include <stdexcept>
0036 #include <string>
0037 #include <cctype>
0038 #include <algorithm> // for find_if
0039 #include <boost/config.hpp>
0040 #include <boost/assert.hpp>
0041 #include <boost/type_traits/conditional.hpp>
0042 #include <boost/type_traits/is_pointer.hpp>
0043 #include <boost/detail/workaround.hpp>
0044 #include <boost/throw_exception.hpp>
0045 #if !defined(BOOST_NO_CWCTYPE)
0046 #include <cwctype>
0047 #endif
0048
0049 //
0050 // the following must not be macros if we are to prefix them
0051 // with std:: (they shouldn't be macros anyway...)
0052 //
0053 #ifdef ispunct
0054 #  undef ispunct
0055 #endif
0056 #ifdef iswpunct
0057 #  undef iswpunct
0058 #endif
0059 #ifdef isspace
0060 #  undef isspace
0061 #endif
0062 #ifdef iswspace
0063 #  undef iswspace
0064 #endif
0065 //
0066 // fix namespace problems:
0067 //
0068 #ifdef BOOST_NO_STDC_NAMESPACE
0069 namespace std{
0070  using ::ispunct;
0071  using ::isspace;
0072 #if !defined(BOOST_NO_CWCTYPE)
0073  using ::iswpunct;
0074  using ::iswspace;
0075 #endif
0076 }
0077 #endif
0078
0079 namespace boost{
0080   //===========================================================================
  // The escaped_list_separator class, which is a model of TokenizerFunction.
  // An escaped list is a superset of what is commonly known as a comma
  // separated value (csv) list. It is separated into fields by a comma or
  // other character. If the delimiting character is inside quotes, then it is
  // counted as a regular character. To allow for embedded quotes in a field,
  // there can be escape sequences using the \ much like C.
  // The role of the comma, the quotation mark, and the escape
  // character (backslash \), can be assigned to other characters.
0089
0090   struct escaped_list_error : public std::runtime_error{
0091     escaped_list_error(const std::string& what_arg):std::runtime_error(what_arg) { }
0092   };
0093
0094
0095 // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
0096 // MSVC does not like the following typename
0097   template <class Char,
0098     class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
0099   class escaped_list_separator {
0100
0101   private:
0102     typedef std::basic_string<Char,Traits> string_type;
0103     struct char_eq {
0104       Char e_;
0105       char_eq(Char e):e_(e) { }
0106       bool operator()(Char c) {
0107         return Traits::eq(e_,c);
0108       }
0109     };
0110     string_type  escape_;
0111     string_type  c_;
0112     string_type  quote_;
0113     bool last_;
0114
0115     bool is_escape(Char e) {
0116       char_eq f(e);
0117       return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end();
0118     }
0119     bool is_c(Char e) {
0120       char_eq f(e);
0121       return std::find_if(c_.begin(),c_.end(),f)!=c_.end();
0122     }
0123     bool is_quote(Char e) {
0124       char_eq f(e);
0125       return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end();
0126     }
0127     template <typename iterator, typename Token>
0128     void do_escape(iterator& next,iterator end,Token& tok) {
0129       if (++next == end)
0130         BOOST_THROW_EXCEPTION(escaped_list_error(std::string("cannot end with escape")));
0131       if (Traits::eq(*next,'n')) {
0132         tok+='\n';
0133         return;
0134       }
0135       else if (is_quote(*next)) {
0136         tok+=*next;
0137         return;
0138       }
0139       else if (is_c(*next)) {
0140         tok+=*next;
0141         return;
0142       }
0143       else if (is_escape(*next)) {
0144         tok+=*next;
0145         return;
0146       }
0147       else
0148         BOOST_THROW_EXCEPTION(escaped_list_error(std::string("unknown escape sequence")));
0149     }
0150
0151     public:
0152
0153     explicit escaped_list_separator(Char  e = '\\',
0154                                     Char c = ',',Char  q = '\"')
0155       : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }
0156
0157     escaped_list_separator(string_type e, string_type c, string_type q)
0158       : escape_(e), c_(c), quote_(q), last_(false) { }
0159
0160     void reset() {last_=false;}
0161
0162     template <typename InputIterator, typename Token>
0163     bool operator()(InputIterator& next,InputIterator end,Token& tok) {
0164       bool bInQuote = false;
0165       tok = Token();
0166
0167       if (next == end) {
0168         if (last_) {
0169           last_ = false;
0170           return true;
0171         }
0172         else
0173           return false;
0174       }
0175       last_ = false;
0176       for (;next != end;++next) {
0177         if (is_escape(*next)) {
0178           do_escape(next,end,tok);
0179         }
0180         else if (is_c(*next)) {
0181           if (!bInQuote) {
0182             // If we are not in quote, then we are done
0183             ++next;
0184             // The last character was a c, that means there is
0185             // 1 more blank field
0186             last_ = true;
0187             return true;
0188           }
0189           else tok+=*next;
0190         }
0191         else if (is_quote(*next)) {
0192           bInQuote=!bInQuote;
0193         }
0194         else {
0195           tok += *next;
0196         }
0197       }
0198       return true;
0199     }
0200   };
0201
0202   //===========================================================================
0203   // The classes here are used by offset_separator and char_separator to implement
0204   // faster assigning of tokens using assign instead of +=
0205
0206   namespace tokenizer_detail {
0207   //===========================================================================
0208   // Tokenizer was broken for wide character separators, at least on Windows, since
0209   // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts
0210   // if higher values are passed in. The traits extension class should take care of this.
0211   // Assuming that the conditional will always get optimized out in the function
0212   // implementations, argument types are not a problem since both forms of character classifiers
0213   // expect an int.
0214
0215 #if !defined(BOOST_NO_CWCTYPE)
0216   template<typename traits, int N>
0217   struct traits_extension_details : public traits {
0218     typedef typename traits::char_type char_type;
0219     static bool isspace(char_type c)
0220     {
0221        return std::iswspace(c) != 0;
0222     }
0223     static bool ispunct(char_type c)
0224     {
0225        return std::iswpunct(c) != 0;
0226     }
0227   };
0228
0229   template<typename traits>
0230   struct traits_extension_details<traits, 1> : public traits {
0231     typedef typename traits::char_type char_type;
0232     static bool isspace(char_type c)
0233     {
0234        return std::isspace(c) != 0;
0235     }
0236     static bool ispunct(char_type c)
0237     {
0238        return std::ispunct(c) != 0;
0239     }
0240   };
0241 #endif
0242
0243
0244   // In case there is no cwctype header, we implement the checks manually.
0245   // We make use of the fact that the tested categories should fit in ASCII.
0246   template<typename traits>
0247   struct traits_extension : public traits {
0248     typedef typename traits::char_type char_type;
0249     static bool isspace(char_type c)
0250     {
0251 #if !defined(BOOST_NO_CWCTYPE)
0252       return traits_extension_details<traits, sizeof(char_type)>::isspace(c);
0253 #else
0254       return static_cast< unsigned >(c) <= 255 && std::isspace(c) != 0;
0255 #endif
0256     }
0257
0258     static bool ispunct(char_type c)
0259     {
0260 #if !defined(BOOST_NO_CWCTYPE)
0261       return traits_extension_details<traits, sizeof(char_type)>::ispunct(c);
0262 #else
0263       return static_cast< unsigned >(c) <= 255 && std::ispunct(c) != 0;
0264 #endif
0265     }
0266   };
0267
0268   // The assign_or_plus_equal struct contains functions that implement
0269   // assign, +=, and clearing based on the iterator type.  The
0270   // generic case does nothing for plus_equal and clearing, while
0271   // passing through the call for assign.
0272   //
0273   // When an input iterator is being used, the situation is reversed.
0274   // The assign method does nothing, plus_equal invokes operator +=,
0275   // and the clearing method sets the supplied token to the default
0276   // token constructor's result.
0277   //
0278
0279   template<class IteratorTag>
0280   struct assign_or_plus_equal {
0281     template<class Iterator, class Token>
0282     static void assign(Iterator b, Iterator e, Token &t) {
0283       t.assign(b, e);
0284     }
0285
0286     template<class Token, class Value>
0287     static void plus_equal(Token &, const Value &) { }
0288
0289     // If we are doing an assign, there is no need for the
0290     // the clear.
0291     //
0292     template<class Token>
0293     static void clear(Token &) { }
0294   };
0295
0296   template <>
0297   struct assign_or_plus_equal<std::input_iterator_tag> {
0298     template<class Iterator, class Token>
0299     static void assign(Iterator , Iterator , Token &) { }
0300     template<class Token, class Value>
0301     static void plus_equal(Token &t, const Value &v) {
0302       t += v;
0303     }
0304     template<class Token>
0305     static void clear(Token &t) {
0306       t = Token();
0307     }
0308   };
0309
0310
0311   template<class Iterator>
0312   struct pointer_iterator_category{
0313     typedef std::random_access_iterator_tag type;
0314   };
0315
0316
0317   template<class Iterator>
0318   struct class_iterator_category{
0319     typedef typename Iterator::iterator_category type;
0320   };
0321
0322
0323
0324   // This portably gets the iterator_tag without partial template specialization
0325   template<class Iterator>
0326     struct get_iterator_category{
0327     typedef typename conditional<is_pointer<Iterator>::value,
0328       pointer_iterator_category<Iterator>,
0329       class_iterator_category<Iterator>
0330     >::type cat;
0331
0332     typedef typename cat::type iterator_category;
0333   };
0334
0335
0336   } // namespace tokenizer_detail
0337
0338
0339   //===========================================================================
0340   // The offset_separator class, which is a model of TokenizerFunction.
0341   // Offset breaks a string into tokens based on a range of offsets
0342
0343   class offset_separator {
0344   private:
0345
0346     std::vector<int> offsets_;
0347     unsigned int current_offset_;
0348     bool wrap_offsets_;
0349     bool return_partial_last_;
0350
0351   public:
0352     template <typename Iter>
0353     offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
0354                      bool return_partial_last = true)
0355       : offsets_(begin,end), current_offset_(0),
0356         wrap_offsets_(wrap_offsets),
0357         return_partial_last_(return_partial_last) { }
0358
0359     offset_separator()
0360       : offsets_(1,1), current_offset_(),
0361         wrap_offsets_(true), return_partial_last_(true) { }
0362
0363     void reset() {
0364       current_offset_ = 0;
0365     }
0366
0367     template <typename InputIterator, typename Token>
0368     bool operator()(InputIterator& next, InputIterator end, Token& tok)
0369     {
0370       typedef tokenizer_detail::assign_or_plus_equal<
0371         BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category<
0372           InputIterator
0373         >::iterator_category
0374       > assigner;
0375
0376       BOOST_ASSERT(!offsets_.empty());
0377
0378       assigner::clear(tok);
0379       InputIterator start(next);
0380
0381       if (next == end)
0382         return false;
0383
0384       if (current_offset_ == offsets_.size())
0385       {
0386         if (wrap_offsets_)
0387           current_offset_=0;
0388         else
0389           return false;
0390       }
0391
0392       int c = offsets_[current_offset_];
0393       int i = 0;
0394       for (; i < c; ++i) {
0395         if (next == end)break;
0396         assigner::plus_equal(tok,*next++);
0397       }
0398       assigner::assign(start,next,tok);
0399
0400       if (!return_partial_last_)
0401         if (i < (c-1) )
0402           return false;
0403
0404       ++current_offset_;
0405       return true;
0406     }
0407   };
0408
0409
0410   //===========================================================================
0411   // The char_separator class breaks a sequence of characters into
0412   // tokens based on the character delimiters (very much like bad old
0413   // strtok). A delimiter character can either be kept or dropped. A
0414   // kept delimiter shows up as an output token, whereas a dropped
0415   // delimiter does not.
0416
0417   // This class replaces the char_delimiters_separator class. The
0418   // constructor for the char_delimiters_separator class was too
0419   // confusing and needed to be deprecated. However, because of the
0420   // default arguments to the constructor, adding the new constructor
0421   // would cause ambiguity, so instead I deprecated the whole class.
0422   // The implementation of the class was also simplified considerably.
0423
0424   enum empty_token_policy { drop_empty_tokens, keep_empty_tokens };
0425
0426   // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
0427   template <typename Char,
0428     typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
0429   class char_separator
0430   {
0431     typedef tokenizer_detail::traits_extension<Tr> Traits;
0432     typedef std::basic_string<Char,Tr> string_type;
0433   public:
0434     explicit
0435     char_separator(const Char* dropped_delims,
0436                    const Char* kept_delims = 0,
0437                    empty_token_policy empty_tokens = drop_empty_tokens)
0438       : m_dropped_delims(dropped_delims),
0439         m_use_ispunct(false),
0440         m_use_isspace(false),
0441         m_empty_tokens(empty_tokens),
0442         m_output_done(false)
0443     {
0444       // Borland workaround
0445       if (kept_delims)
0446         m_kept_delims = kept_delims;
0447     }
0448
0449                 // use ispunct() for kept delimiters and isspace for dropped.
0450     explicit
0451     char_separator()
0452       : m_use_ispunct(true),
0453         m_use_isspace(true),
0454         m_empty_tokens(drop_empty_tokens),
0455         m_output_done(false) { }
0456
0457     void reset() { }
0458
0459     template <typename InputIterator, typename Token>
0460     bool operator()(InputIterator& next, InputIterator end, Token& tok)
0461     {
0462       typedef tokenizer_detail::assign_or_plus_equal<
0463         BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category<
0464           InputIterator
0465         >::iterator_category
0466       > assigner;
0467
0468       assigner::clear(tok);
0469
0470       // skip past all dropped_delims
0471       if (m_empty_tokens == drop_empty_tokens)
0472         for (; next != end  && is_dropped(*next); ++next)
0473           { }
0474
0475       InputIterator start(next);
0476
0477       if (m_empty_tokens == drop_empty_tokens) {
0478
0479         if (next == end)
0480           return false;
0481
0482
0483         // if we are on a kept_delims move past it and stop
0484         if (is_kept(*next)) {
0485           assigner::plus_equal(tok,*next);
0486           ++next;
0487         } else
0488           // append all the non delim characters
0489           for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
0490             assigner::plus_equal(tok,*next);
0491       }
0492       else { // m_empty_tokens == keep_empty_tokens
0493
0494         // Handle empty token at the end
0495         if (next == end)
0496         {
0497           if (m_output_done == false)
0498           {
0499             m_output_done = true;
0500             assigner::assign(start,next,tok);
0501             return true;
0502           }
0503           else
0504             return false;
0505         }
0506
0507         if (is_kept(*next)) {
0508           if (m_output_done == false)
0509             m_output_done = true;
0510           else {
0511             assigner::plus_equal(tok,*next);
0512             ++next;
0513             m_output_done = false;
0514           }
0515         }
0516         else if (m_output_done == false && is_dropped(*next)) {
0517           m_output_done = true;
0518         }
0519         else {
0520           if (is_dropped(*next))
0521             start=++next;
0522           for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
0523             assigner::plus_equal(tok,*next);
0524           m_output_done = true;
0525         }
0526       }
0527       assigner::assign(start,next,tok);
0528       return true;
0529     }
0530
0531   private:
0532     string_type m_kept_delims;
0533     string_type m_dropped_delims;
0534     bool m_use_ispunct;
0535     bool m_use_isspace;
0536     empty_token_policy m_empty_tokens;
0537     bool m_output_done;
0538
0539     bool is_kept(Char E) const
0540     {
0541       if (m_kept_delims.length())
0542         return m_kept_delims.find(E) != string_type::npos;
0543       else if (m_use_ispunct) {
0544         return Traits::ispunct(E) != 0;
0545       } else
0546         return false;
0547     }
0548     bool is_dropped(Char E) const
0549     {
0550       if (m_dropped_delims.length())
0551         return m_dropped_delims.find(E) != string_type::npos;
0552       else if (m_use_isspace) {
0553         return Traits::isspace(E) != 0;
0554       } else
0555         return false;
0556     }
0557   };
0558
0559   //===========================================================================
  // The following class is DEPRECATED, use class char_separator instead.
0561   //
0562   // The char_delimiters_separator class, which is a model of
0563   // TokenizerFunction.  char_delimiters_separator breaks a string
0564   // into tokens based on character delimiters. There are 2 types of
0565   // delimiters. returnable delimiters can be returned as
0566   // tokens. These are often punctuation. nonreturnable delimiters
0567   // cannot be returned as tokens. These are often whitespace
0568
0569   // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
0570   template <class Char,
0571     class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
0572   class char_delimiters_separator {
0573   private:
0574
0575     typedef tokenizer_detail::traits_extension<Tr> Traits;
0576     typedef std::basic_string<Char,Tr> string_type;
0577     string_type returnable_;
0578     string_type nonreturnable_;
0579     bool return_delims_;
0580     bool no_ispunct_;
0581     bool no_isspace_;
0582
0583     bool is_ret(Char E)const
0584     {
0585       if (returnable_.length())
0586         return  returnable_.find(E) != string_type::npos;
0587       else{
0588         if (no_ispunct_) {return false;}
0589         else{
0590           int r = Traits::ispunct(E);
0591           return r != 0;
0592         }
0593       }
0594     }
0595     bool is_nonret(Char E)const
0596     {
0597       if (nonreturnable_.length())
0598         return  nonreturnable_.find(E) != string_type::npos;
0599       else{
0600         if (no_isspace_) {return false;}
0601         else{
0602           int r = Traits::isspace(E);
0603           return r != 0;
0604         }
0605       }
0606     }
0607
0608   public:
0609     explicit char_delimiters_separator(bool return_delims = false,
0610                                        const Char* returnable = 0,
0611                                        const Char* nonreturnable = 0)
0612       : returnable_(returnable ? returnable : string_type().c_str()),
0613         nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
0614         return_delims_(return_delims), no_ispunct_(returnable!=0),
0615         no_isspace_(nonreturnable!=0) { }
0616
0617     void reset() { }
0618
0619   public:
0620
0621      template <typename InputIterator, typename Token>
0622      bool operator()(InputIterator& next, InputIterator end,Token& tok) {
0623      tok = Token();
0624
0625      // skip past all nonreturnable delims
0626      // skip past the returnable only if we are not returning delims
0627      for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
0628        && !return_delims_ ) );++next) { }
0629
0630      if (next == end) {
0631        return false;
0632      }
0633
0634      // if we are to return delims and we are one a returnable one
0635      // move past it and stop
0636      if (is_ret(*next) && return_delims_) {
0637        tok+=*next;
0638        ++next;
0639      }
0640      else
0641        // append all the non delim characters
0642        for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
0643          tok+=*next;
0644
0645
0646      return true;
0647    }
0648   };
0649
0650
0651 } //namespace boost
0652
0653 #endif