move/algo/adaptive_merge.hpp

0001 //////////////////////////////////////////////////////////////////////////////
0002 //
0003 // (C) Copyright Ion Gaztanaga 2015-2016.
0004 // Distributed under the Boost Software License, Version 1.0.
0005 // (See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt)
0007 //
0008 // See http://www.boost.org/libs/move for documentation.
0009 //
0010 //////////////////////////////////////////////////////////////////////////////
0011
0012 #ifndef BOOST_MOVE_ADAPTIVE_MERGE_HPP
0013 #define BOOST_MOVE_ADAPTIVE_MERGE_HPP
0014
0015 #include <boost/move/detail/config_begin.hpp>
0016 #include <boost/move/algo/detail/adaptive_sort_merge.hpp>
0017 #include <cassert>
0018
0019 #if defined(BOOST_CLANG) || (defined(BOOST_GCC) && (BOOST_GCC >= 40600))
0020 #pragma GCC diagnostic push
0021 #pragma GCC diagnostic ignored "-Wsign-conversion"
0022 #pragma GCC diagnostic ignored "-Wconversion"
0023 #endif
0024
0025 namespace boost {
0026 namespace movelib {
0027
0028 ///@cond
0029 namespace detail_adaptive {
0030
0031 template<class RandIt, class Compare, class XBuf>
0032 inline void adaptive_merge_combine_blocks( RandIt first
0033                                       , typename iter_size<RandIt>::type len1
0034                                       , typename iter_size<RandIt>::type len2
0035                                       , typename iter_size<RandIt>::type collected
0036                                       , typename iter_size<RandIt>::type n_keys
0037                                       , typename iter_size<RandIt>::type l_block
0038                                       , bool use_internal_buf
0039                                       , bool xbuf_used
0040                                       , Compare comp
0041                                       , XBuf & xbuf
0042                                       )
0043 {
0044    typedef typename iter_size<RandIt>::type       size_type;
0045
0046    size_type const len = size_type(len1+len2);
0047    size_type const l_combine  = size_type(len-collected);
0048    size_type const l_combine1 = size_type(len1-collected);
0049
0050     if(n_keys){
0051       RandIt const first_data = first+collected;
0052       RandIt const keys = first;
0053       BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   A combine: ", len);
0054       if(xbuf_used){
0055          if(xbuf.size() < l_block){
0056             xbuf.initialize_until(l_block, *first);
0057          }
0058          assert(xbuf.size() >= l_block);
0059          size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
0060          combine_params( keys, comp, l_combine
0061                            , l_combine1, l_block, xbuf
0062                            , n_block_a, n_block_b, l_irreg1, l_irreg2);   //Outputs
0063          op_merge_blocks_with_buf
0064             (keys, comp, first_data, l_block, l_irreg1, n_block_a, n_block_b, l_irreg2, comp, move_op(), xbuf.data());
0065          BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("   A mrg xbf: ", len);
0066       }
0067       else{
0068          size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
0069          combine_params( keys, comp, l_combine
0070                            , l_combine1, l_block, xbuf
0071                            , n_block_a, n_block_b, l_irreg1, l_irreg2);   //Outputs
0072          if(use_internal_buf){
0073             op_merge_blocks_with_buf
0074                ( keys, comp, first_data, l_block, l_irreg1, n_block_a, n_block_b
0075                , l_irreg2, comp, swap_op(), first_data-l_block);
0076             BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   A mrg buf: ", len);
0077          }
0078          else{
0079             merge_blocks_bufferless
0080                (keys, comp, first_data, l_block, l_irreg1, n_block_a, n_block_b, l_irreg2, comp);
0081             BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("   A mrg nbf: ", len);
0082          }
0083       }
0084    }
0085    else{
0086       xbuf.shrink_to_fit(l_block);
0087       if(xbuf.size() < l_block){
0088          xbuf.initialize_until(l_block, *first);
0089       }
0090       size_type *const uint_keys = xbuf.template aligned_trailing<size_type>(l_block);
0091       size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
0092       combine_params( uint_keys, less(), l_combine
0093                      , l_combine1, l_block, xbuf
0094                      , n_block_a, n_block_b, l_irreg1, l_irreg2, true);   //Outputs
0095       BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   A combine: ", len);
0096       assert(xbuf.size() >= l_block);
0097       op_merge_blocks_with_buf
0098          (uint_keys, less(), first, l_block, l_irreg1, n_block_a, n_block_b, l_irreg2, comp, move_op(), xbuf.data());
0099       xbuf.clear();
0100       BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("   A mrg buf: ", len);
0101    }
0102 }
0103
0104 template<class RandIt, class Compare, class XBuf>
0105 inline void adaptive_merge_final_merge( RandIt first
0106                                       , typename iter_size<RandIt>::type len1
0107                                       , typename iter_size<RandIt>::type len2
0108                                       , typename iter_size<RandIt>::type collected
0109                                       , typename iter_size<RandIt>::type l_intbuf
0110                                       , typename iter_size<RandIt>::type //l_block
0111                                       , bool //use_internal_buf
0112                                       , bool xbuf_used
0113                                       , Compare comp
0114                                       , XBuf & xbuf
0115                                       )
0116 {
0117    typedef typename iter_size<RandIt>::type       size_type;
0118
0119    size_type n_keys = size_type(collected-l_intbuf);
0120    size_type len = size_type(len1+len2);
0121    if (!xbuf_used || n_keys) {
0122       xbuf.clear();
0123       const size_type middle = xbuf_used && n_keys ? n_keys: collected;
0124       unstable_sort(first, first + middle, comp, xbuf);
0125       BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   A k/b srt: ", len);
0126       stable_merge(first, first + middle, first + len, comp, xbuf);
0127    }
0128    BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("   A fin mrg: ", len);
0129 }
0130
0131 template<class SizeType>
0132 inline static SizeType adaptive_merge_n_keys_without_external_keys(SizeType l_block, SizeType len1, SizeType len2, SizeType l_intbuf)
0133 {
0134    typedef SizeType size_type;
0135    //This is the minimum number of keys to implement the ideal algorithm
0136    size_type n_keys = size_type(len1/l_block + len2/l_block);
0137    const size_type second_half_blocks = size_type(len2/l_block);
0138    const size_type first_half_aux = size_type(len1 - l_intbuf);
0139    while(n_keys >= ((first_half_aux-n_keys)/l_block + second_half_blocks)){
0140       --n_keys;
0141    }
0142    ++n_keys;
0143    return n_keys;
0144 }
0145
0146 template<class SizeType>
0147 inline static SizeType adaptive_merge_n_keys_with_external_keys(SizeType l_block, SizeType len1, SizeType len2, SizeType l_intbuf)
0148 {
0149    typedef SizeType size_type;
0150    //This is the minimum number of keys to implement the ideal algorithm
0151    size_type n_keys = size_type((len1-l_intbuf)/l_block + len2/l_block);
0152    return n_keys;
0153 }
0154
0155 template<class SizeType, class Xbuf>
0156 inline SizeType adaptive_merge_n_keys_intbuf(SizeType &rl_block, SizeType len1, SizeType len2, Xbuf & xbuf, SizeType &l_intbuf_inout)
0157 {
0158    typedef SizeType size_type;
0159    size_type l_block = rl_block;
0160    size_type l_intbuf = xbuf.capacity() >= l_block ? 0u : l_block;
0161
0162    if (xbuf.capacity() > l_block){
0163       l_block = xbuf.capacity();
0164    }
0165
0166    //This is the minimum number of keys to implement the ideal algorithm
0167    size_type n_keys = adaptive_merge_n_keys_without_external_keys(l_block, len1, len2, l_intbuf);
0168    assert(n_keys >= ((len1-l_intbuf-n_keys)/l_block + len2/l_block));
0169
0170    if(xbuf.template supports_aligned_trailing<size_type>
0171       ( l_block
0172       , adaptive_merge_n_keys_with_external_keys(l_block, len1, len2, l_intbuf)))
0173    {
0174       n_keys = 0u;
0175    }
0176    l_intbuf_inout = l_intbuf;
0177    rl_block = l_block;
0178    return n_keys;
0179 }
0180
0181 // Main explanation of the merge algorithm.
0182 //
0183 // csqrtlen = ceil(sqrt(len));
0184 //
0185 // * First, csqrtlen [to be used as buffer] + (len/csqrtlen - 1) [to be used as keys] => to_collect
0186 //   unique elements are extracted from elements to be sorted and placed in the beginning of the range.
0187 //
0188 // * Step "combine_blocks": the leading (len1-to_collect) elements plus trailing len2 elements
0189 //   are merged with a non-trivial ("smart") algorithm to form an ordered range trailing "len-to_collect" elements.
0190 //
0191 //   Explanation of the "combine_blocks" step:
0192 //
0193 //         * Trailing [first+to_collect, first+len1) elements are divided in groups of cqrtlen elements.
0194 //           Remaining elements that can't form a group are grouped in front of those elements.
0195 //         * Trailing [first+len1, first+len1+len2) elements are divided in groups of cqrtlen elements.
0196 //           Remaining elements that can't form a group are grouped in the back of those elements.
0197 //         * In parallel the following two steps are performed:
0198 //             *  Groups are selection-sorted by first or last element (depending whether they are going
0199 //                to be merged to left or right) and keys are reordered accordingly as an imitation-buffer.
0200 //             * Elements of each block pair are merged using the csqrtlen buffer taking into account
0201 //                if they belong to the first half or second half (marked by the key).
0202 //
0203 // * In the final merge step leading "to_collect" elements are merged with rotations
0204 //   with the rest of merged elements in the "combine_blocks" step.
0205 //
0206 // Corner cases:
0207 //
0208 // * If no "to_collect" elements can be extracted:
0209 //
0210 //    * If more than a minimum number of elements is extracted
0211 //      then reduces the number of elements used as buffer and keys in the
0212 //      and "combine_blocks" steps. If "combine_blocks" has no enough keys due to this reduction
0213 //      then uses a rotation based smart merge.
0214 //
0215 //    * If the minimum number of keys can't be extracted, a rotation-based merge is performed.
0216 //
0217 // * If auxiliary memory is more or equal than min(len1, len2), a buffered merge is performed.
0218 //
0219 // * If the len1 or len2 are less than 2*csqrtlen then a rotation-based merge is performed.
0220 //
0221 // * If auxiliary memory is more than csqrtlen+n_keys*sizeof(std::size_t),
0222 //   then no csqrtlen need to be extracted and "combine_blocks" will use integral
0223 //   keys to combine blocks.
0224 template<class RandIt, class Compare, class XBuf>
0225 void adaptive_merge_impl
0226    ( RandIt first
0227    , typename iter_size<RandIt>::type len1
0228    , typename iter_size<RandIt>::type len2
0229    , Compare comp
0230    , XBuf & xbuf
0231    )
0232 {
0233    typedef typename iter_size<RandIt>::type size_type;
0234
0235    if(xbuf.capacity() >= min_value<size_type>(len1, len2)){
0236       buffered_merge( first, first+len1
0237                     , first + len1+len2, comp, xbuf);
0238    }
0239    else{
0240       const size_type len = size_type(len1+len2);
0241       //Calculate ideal parameters and try to collect needed unique keys
0242       size_type l_block = size_type(ceil_sqrt(len));
0243
0244       //One range is not big enough to extract keys and the internal buffer so a
0245       //rotation-based based merge will do just fine
0246       if(len1 <= l_block*2 || len2 <= l_block*2){
0247          merge_bufferless(first, first+len1, first+len1+len2, comp);
0248          return;
0249       }
0250
0251       //Detail the number of keys and internal buffer. If xbuf has enough memory, no
0252       //internal buffer is needed so l_intbuf will remain 0.
0253       size_type l_intbuf = 0;
0254       size_type n_keys = adaptive_merge_n_keys_intbuf(l_block, len1, len2, xbuf, l_intbuf);
0255       size_type const to_collect = size_type(l_intbuf+n_keys);
0256       //Try to extract needed unique values from the first range
0257       size_type const collected  = collect_unique(first, first+len1, to_collect, comp, xbuf);
0258       BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("\n   A collect: ", len);
0259
0260       //Not the minimum number of keys is not available on the first range, so fallback to rotations
0261       if(collected != to_collect && collected < 4){
0262          merge_bufferless(first, first+collected, first+len1, comp);
0263          merge_bufferless(first, first + len1, first + len1 + len2, comp);
0264          return;
0265       }
0266
0267       //If not enough keys but more than minimum, adjust the internal buffer and key count
0268       bool use_internal_buf = collected == to_collect;
0269       if (!use_internal_buf){
0270          l_intbuf = 0u;
0271          n_keys = collected;
0272          l_block  = lblock_for_combine(l_intbuf, n_keys, len, use_internal_buf);
0273          //If use_internal_buf is false, then then internal buffer will be zero and rotation-based combination will be used
0274          l_intbuf = use_internal_buf ? l_block : 0u;
0275       }
0276
0277       bool const xbuf_used = collected == to_collect && xbuf.capacity() >= l_block;
0278       //Merge trailing elements using smart merges
0279       adaptive_merge_combine_blocks(first, len1, len2, collected,   n_keys, l_block, use_internal_buf, xbuf_used, comp, xbuf);
0280       //Merge buffer and keys with the rest of the values
0281       adaptive_merge_final_merge   (first, len1, len2, collected, l_intbuf, l_block, use_internal_buf, xbuf_used, comp, xbuf);
0282    }
0283 }
0284
0285 }  //namespace detail_adaptive {
0286
0287 ///@endcond
0288
0289 //! <b>Effects</b>: Merges two consecutive sorted ranges [first, middle) and [middle, last)
0290 //!   into one sorted range [first, last) according to the given comparison function comp.
0291 //!   The algorithm is stable (if there are equivalent elements in the original two ranges,
0292 //!   the elements from the first range (preserving their original order) precede the elements
0293 //!   from the second range (preserving their original order).
0294 //!
0295 //! <b>Requires</b>:
0296 //!   - RandIt must meet the requirements of ValueSwappable and RandomAccessIterator.
0297 //!   - The type of dereferenced RandIt must meet the requirements of MoveAssignable and MoveConstructible.
0298 //!
0299 //! <b>Parameters</b>:
0300 //!   - first: the beginning of the first sorted range.
0301 //!   - middle: the end of the first sorted range and the beginning of the second
0302 //!   - last: the end of the second sorted range
0303 //!   - comp: comparison function object which returns true if the first argument is is ordered before the second.
0304 //!   - uninitialized, uninitialized_len: raw storage starting on "uninitialized", able to hold "uninitialized_len"
0305 //!      elements of type iterator_traits<RandIt>::value_type. Maximum performance is achieved when uninitialized_len
0306 //!      is min(std::distance(first, middle), std::distance(middle, last)).
0307 //!
0308 //! <b>Throws</b>: If comp throws or the move constructor, move assignment or swap of the type
0309 //!   of dereferenced RandIt throws.
0310 //!
0311 //! <b>Complexity</b>: Always K x O(N) comparisons and move assignments/constructors/swaps.
0312 //!   Constant factor for comparisons and data movement is minimized when uninitialized_len
0313 //!   is min(std::distance(first, middle), std::distance(middle, last)).
0314 //!   Pretty good enough performance is achieved when uninitialized_len is
0315 //!   ceil(sqrt(std::distance(first, last)))*2.
0316 //!
0317 //! <b>Caution</b>: Experimental implementation, not production-ready.
0318 template<class RandIt, class Compare>
0319 void adaptive_merge( RandIt first, RandIt middle, RandIt last, Compare comp
0320                 , typename iterator_traits<RandIt>::value_type* uninitialized = 0
0321                 , typename iter_size<RandIt>::type uninitialized_len = 0)
0322 {
0323    typedef typename iter_size<RandIt>::type  size_type;
0324    typedef typename iterator_traits<RandIt>::value_type value_type;
0325
0326    if (first == middle || middle == last){
0327       return;
0328    }
0329
0330    //Reduce ranges to merge if possible
0331    do {
0332       if (comp(*middle, *first)){
0333          break;
0334       }
0335       ++first;
0336       if (first == middle)
0337          return;
0338    } while(1);
0339
0340    RandIt first_high(middle);
0341    --first_high;
0342    do {
0343       --last;
0344       if (comp(*last, *first_high)){
0345          ++last;
0346          break;
0347       }
0348       if (last == middle)
0349          return;
0350    } while(1);
0351
0352    ::boost::movelib::adaptive_xbuf<value_type, value_type*, size_type> xbuf(uninitialized, size_type(uninitialized_len));
0353    ::boost::movelib::detail_adaptive::adaptive_merge_impl(first, size_type(middle - first), size_type(last - middle), comp, xbuf);
0354 }
0355
0356 }  //namespace movelib {
0357 }  //namespace boost {
0358
0359 #if defined(BOOST_CLANG) || (defined(BOOST_GCC) && (BOOST_GCC >= 40600))
0360 #pragma GCC diagnostic pop
0361 #endif
0362
0363 #include <boost/move/detail/config_end.hpp>
0364
0365 #endif   //#define BOOST_MOVE_ADAPTIVE_MERGE_HPP