Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-07-11 08:16:05

0001 //////////////////////////////////////////////////////////////////////////////
0002 //
0003 // (C) Copyright Ion Gaztanaga 2015-2016.
0004 // Distributed under the Boost Software License, Version 1.0.
0005 // (See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt)
0007 //
0008 // See http://www.boost.org/libs/move for documentation.
0009 //
0010 //////////////////////////////////////////////////////////////////////////////
0011 
0012 #ifndef BOOST_MOVE_ADAPTIVE_SORT_HPP
0013 #define BOOST_MOVE_ADAPTIVE_SORT_HPP
0014 
0015 #include <boost/move/detail/config_begin.hpp>
0016 
0017 #include <boost/move/algo/detail/adaptive_sort_merge.hpp>
0018 #include <cassert>
0019 
0020 #if defined(BOOST_CLANG) || (defined(BOOST_GCC) && (BOOST_GCC >= 40600))
0021 #pragma GCC diagnostic push
0022 #pragma GCC diagnostic ignored "-Wsign-conversion"
0023 #pragma GCC diagnostic ignored "-Wconversion"
0024 #endif
0025 
0026 namespace boost {
0027 namespace movelib {
0028 
0029 ///@cond
0030 namespace detail_adaptive {
0031 
0032 template<class RandIt>
0033 void move_data_backward( RandIt cur_pos
0034               , typename iter_size<RandIt>::type const l_data
0035               , RandIt new_pos
0036               , bool const xbuf_used)
0037 {
0038    //Move buffer to the total combination right
0039    if(xbuf_used){
0040       boost::move_backward(cur_pos, cur_pos+l_data, new_pos+l_data);      
0041    }
0042    else{
0043       boost::adl_move_swap_ranges_backward(cur_pos, cur_pos+l_data, new_pos+l_data);      
0044       //Rotate does less moves but it seems slower due to cache issues
0045       //rotate_gcd(first-l_block, first+len-l_block, first+len);
0046    }
0047 }
0048 
0049 template<class RandIt>
0050 void move_data_forward( RandIt cur_pos
0051               , typename iter_size<RandIt>::type const l_data
0052               , RandIt new_pos
0053               , bool const xbuf_used)
0054 {
0055    //Move buffer to the total combination right
0056    if(xbuf_used){
0057       boost::move(cur_pos, cur_pos+l_data, new_pos);
0058    }
0059    else{
0060       boost::adl_move_swap_ranges(cur_pos, cur_pos+l_data, new_pos);
0061       //Rotate does less moves but it seems slower due to cache issues
0062       //rotate_gcd(first-l_block, first+len-l_block, first+len);
0063    }
0064 }
0065 
0066 // build blocks of length 2*l_build_buf. l_build_buf is power of two
0067 // input: [0, l_build_buf) elements are buffer, rest unsorted elements
0068 // output: [0, l_build_buf) elements are buffer, blocks 2*l_build_buf and last subblock sorted
0069 //
0070 // First elements are merged from right to left until elements start
0071 // at first. All old elements [first, first + l_build_buf) are placed at the end
0072 // [first+len-l_build_buf, first+len). To achieve this:
0073 // - If we have external memory to merge, we save elements from the buffer
0074 //   so that a non-swapping merge is used. Buffer elements are restored
0075 //   at the end of the buffer from the external memory.
0076 //
0077 // - When the external memory is not available or it is insufficient
0078 //   for a merge operation, left swap merging is used.
0079 //
0080 // Once elements are merged left to right in blocks of l_build_buf, then a single left
0081 // to right merge step is performed to achieve merged blocks of size 2K.
0082 // If external memory is available, usual merge is used, swap merging otherwise.
0083 //
0084 // As a last step, if auxiliary memory is available in-place merge is performed.
0085 // until all is merged or auxiliary memory is not large enough.
//Builds the initial sorted blocks (see the explanation comment above).
//Parameters:
// - first:       start of buffer + data (buffer is [first, first+l_build_buf))
// - len:         total length, buffer included
// - l_base:      length of the insertion-sorted base segments (power of two)
// - l_build_buf: internal buffer length (l_build_buf/l_base is a power of two)
// - xbuf:        external (possibly empty) auxiliary buffer
//Returns the length of the merged segments produced (2*l_build_buf, or the
//whole data length if it was smaller).
template<class RandIt, class Compare, class XBuf>
typename iter_size<RandIt>::type  
   adaptive_sort_build_blocks
      ( RandIt const first
      , typename iter_size<RandIt>::type const len
      , typename iter_size<RandIt>::type const l_base
      , typename iter_size<RandIt>::type const l_build_buf
      , XBuf & xbuf
      , Compare comp)
{
   typedef typename iter_size<RandIt>::type       size_type;
   assert(l_build_buf <= len);
   //l_build_buf must be l_base multiplied by a power of two
   assert(0 == ((l_build_buf / l_base)&(l_build_buf/l_base-1)));

   //Place the start pointer after the buffer
   RandIt first_block = first + l_build_buf;
   size_type const elements_in_blocks = size_type(len - l_build_buf);

   //////////////////////////////////
   // Start of merge to left step
   //////////////////////////////////
   size_type l_merged = 0u;

   assert(l_build_buf);
   //If there is not enough buffer for the insertion sort step, just avoid the external buffer
   size_type kbuf = min_value<size_type>(l_build_buf, size_type(xbuf.capacity()));
   kbuf = kbuf < l_base ? 0 : kbuf;

   if(kbuf){
      //Backup internal buffer values in external buffer so they can be overwritten
      xbuf.move_assign(first+l_build_buf-kbuf, kbuf);
      l_merged = op_insertion_sort_step_left(first_block, elements_in_blocks, l_base, comp, move_op());

      //Now combine them using the buffer. Elements from buffer can be
      //overwritten since they've been saved to xbuf
      l_merged = op_merge_left_step_multiple
         ( first_block - l_merged, elements_in_blocks, l_merged, l_build_buf, size_type(kbuf - l_merged), comp, move_op());

      //Restore internal buffer from external buffer unless kbuf was l_build_buf,
      //in that case restoration will happen later
      if(kbuf != l_build_buf){
         boost::move(xbuf.data()+kbuf-l_merged, xbuf.data() + kbuf, first_block-l_merged+elements_in_blocks);
      }
   }
   else{
      //No usable external memory: sort in place and rotate the merged
      //elements next to the internal buffer
      l_merged = insertion_sort_step(first_block, elements_in_blocks, l_base, comp);
      rotate_gcd(first_block-l_merged, first_block, first_block+elements_in_blocks);
   }

   //Now combine elements using the buffer. Elements from buffer can't be
   //overwritten since xbuf was not big enough, so merge swapping elements.
   l_merged = op_merge_left_step_multiple
      (first_block-l_merged, elements_in_blocks, l_merged, l_build_buf, size_type(l_build_buf - l_merged), comp, swap_op());

   assert(l_merged == l_build_buf);

   //////////////////////////////////
   // Start of merge to right step
   //////////////////////////////////

   //If kbuf is l_build_buf then we can merge right without swapping
   //Saved data is still in xbuf
   if(kbuf && kbuf == l_build_buf){
      op_merge_right_step_once(first, elements_in_blocks, l_build_buf, comp, move_op());
      //Restore internal buffer from external buffer if kbuf was l_build_buf.
      //as this operation was previously delayed.
      boost::move(xbuf.data(), xbuf.data() + kbuf, first);
   }
   else{
      op_merge_right_step_once(first, elements_in_blocks, l_build_buf, comp, swap_op());
   }
   xbuf.clear();
   //2*l_build_buf or total already merged
   return min_value<size_type>(elements_in_blocks, size_type(2u*l_build_buf));
}
0161 
//One pass of the "combine_blocks" phase: merges each pair of adjacent,
//previously merged segments of length l_prev_merged (plus a possibly shorter
//trailing irregular segment) into segments of length 2*l_prev_merged.
//Parameters:
// - keys/key_comp: movement-imitation key sequence and its comparator
// - first/len:     range holding the segments to combine
// - l_block:       block (group) length used by the smart merge
// - use_buf:       true if an internal buffer of l_block elements is available
// - xbuf_used:     true if buffer contents are backed up in external memory
// - merge_left:    direction of the merge (left uses the buffer before each
//                  segment, right uses the buffer after it)
template<class RandItKeys, class KeyCompare, class RandIt, class Compare, class XBuf>
void adaptive_sort_combine_blocks
   ( RandItKeys const keys
   , KeyCompare key_comp
   , RandIt const first
   , typename iter_size<RandIt>::type const len
   , typename iter_size<RandIt>::type const l_prev_merged
   , typename iter_size<RandIt>::type const l_block
   , bool const use_buf
   , bool const xbuf_used
   , XBuf & xbuf
   , Compare comp
   , bool merge_left)
{
   boost::movelib::ignore(xbuf);
   typedef typename iter_size<RandIt>::type         size_type;

   size_type const l_reg_combined   = size_type(2u*l_prev_merged);
   size_type l_irreg_combined = 0;
   size_type const l_total_combined = calculate_total_combined(len, l_prev_merged, &l_irreg_combined);
   size_type const n_reg_combined = len/l_reg_combined;
   RandIt combined_first = first;

   boost::movelib::ignore(l_total_combined);
   assert(l_total_combined <= len);

   //Number of segments to produce: the regular ones plus one trailing
   //irregular segment if the range length is not an exact multiple
   size_type const max_i = size_type(n_reg_combined + (l_irreg_combined != 0));

   if(merge_left || !use_buf) {
      //Left-to-right pass, merging each pair into the buffer placed before it
      for( size_type combined_i = 0; combined_i != max_i; ) {
         //Now merge blocks
         bool const is_last = combined_i==n_reg_combined;
         size_type const l_cur_combined = is_last ? l_irreg_combined : l_reg_combined;

         range_xbuf<RandIt, size_type, move_op> rbuf( (use_buf && xbuf_used) ? (combined_first-l_block) : combined_first, combined_first);
         size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
         combine_params( keys, key_comp, l_cur_combined
                        , l_prev_merged, l_block, rbuf
                        , n_block_a, n_block_b, l_irreg1, l_irreg2);   //Outputs
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   A combpar:            ", len + l_block);
         BOOST_MOVE_ADAPTIVE_SORT_INVARIANT(boost::movelib::is_sorted(combined_first, combined_first + n_block_a*l_block+l_irreg1, comp));
            BOOST_MOVE_ADAPTIVE_SORT_INVARIANT(boost::movelib::is_sorted(combined_first + n_block_a*l_block+l_irreg1, combined_first + n_block_a*l_block+l_irreg1+n_block_b*l_block+l_irreg2, comp));
         if(!use_buf){
            //No buffer at all: rotation-based merge
            merge_blocks_bufferless
               (keys, key_comp, combined_first, l_block, 0u, n_block_a, n_block_b, l_irreg2, comp);
         }
         else{
            merge_blocks_left
               (keys, key_comp, combined_first, l_block, 0u, n_block_a, n_block_b, l_irreg2, comp, xbuf_used);
         }
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   After merge_blocks_L: ", len + l_block);
         ++combined_i;
         if(combined_i != max_i)
            combined_first += l_reg_combined;
      }
   }
   else{
      //Right-to-left pass, merging each pair into the buffer placed after it
      combined_first += size_type(l_reg_combined*(max_i-1u));
      for( size_type combined_i = max_i; combined_i; ) {
         --combined_i;
         bool const is_last = combined_i==n_reg_combined;
         size_type const l_cur_combined = is_last ? l_irreg_combined : l_reg_combined;

         RandIt const combined_last(combined_first+l_cur_combined);
         range_xbuf<RandIt, size_type, move_op> rbuf(combined_last, xbuf_used ? (combined_last+l_block) : combined_last);
         size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
         combine_params( keys, key_comp, l_cur_combined
                        , l_prev_merged, l_block, rbuf
                        , n_block_a, n_block_b, l_irreg1, l_irreg2);  //Outputs
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   A combpar:            ", len + l_block);
         BOOST_MOVE_ADAPTIVE_SORT_INVARIANT(boost::movelib::is_sorted(combined_first, combined_first + n_block_a*l_block+l_irreg1, comp));
         BOOST_MOVE_ADAPTIVE_SORT_INVARIANT(boost::movelib::is_sorted(combined_first + n_block_a*l_block+l_irreg1, combined_first + n_block_a*l_block+l_irreg1+n_block_b*l_block+l_irreg2, comp));
         merge_blocks_right
            (keys, key_comp, combined_first, l_block, n_block_a, n_block_b, l_irreg2, comp, xbuf_used);
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   After merge_blocks_R: ", len + l_block);
         if(combined_i)
            combined_first -= l_reg_combined;
      }
   }
}
0242 
0243 //Returns true if buffer is placed in 
0244 //[buffer+len-l_intbuf, buffer+len). Otherwise, buffer is
0245 //[buffer,buffer+l_intbuf)
template<class RandIt, class Compare, class XBuf>
bool adaptive_sort_combine_all_blocks
   ( RandIt keys
   , typename iter_size<RandIt>::type &n_keys
   , RandIt const buffer
   , typename iter_size<RandIt>::type const l_buf_plus_data
   , typename iter_size<RandIt>::type l_merged
   , typename iter_size<RandIt>::type &l_intbuf
   , XBuf & xbuf
   , Compare comp)
{
   typedef typename iter_size<RandIt>::type       size_type;

   RandIt const first = buffer + l_intbuf;
   size_type const l_data = size_type(l_buf_plus_data - l_intbuf);
   size_type const l_unique = size_type(l_intbuf + n_keys);
   //Backup data to external buffer once if possible
   bool const common_xbuf = l_data > l_merged && l_intbuf && l_intbuf <= xbuf.capacity();
   if(common_xbuf){
      xbuf.move_assign(buffer, l_intbuf);
   }

   bool prev_merge_left = true;
   size_type l_prev_total_combined = l_merged, l_prev_block = 0;
   bool prev_use_internal_buf = true;

   //Each iteration doubles l_merged until all l_data elements are merged
   for( size_type n = 0; l_data > l_merged
      ; l_merged = size_type(2u*l_merged)
      , ++n){
      //If l_intbuf is non-zero, use that internal buffer.
      //    Implies l_block == l_intbuf && use_internal_buf == true
      //If l_intbuf is zero, see if half keys can be reused as a reduced emergency buffer,
      //    Implies l_block == n_keys/2 && use_internal_buf == true
      //Otherwise, just give up and use all keys to merge using rotations (use_internal_buf = false)
      bool use_internal_buf = false;
      size_type const l_block = lblock_for_combine(l_intbuf, n_keys, size_type(2*l_merged), use_internal_buf);
      assert(!l_intbuf || (l_block == l_intbuf));
      assert(n == 0 || (!use_internal_buf || prev_use_internal_buf) );
      assert(n == 0 || (!use_internal_buf || l_prev_block == l_block) );
      
      //Merge direction alternates every iteration so the buffer ends up on
      //the opposite side of the merged data each time
      bool const is_merge_left = (n&1) == 0;
      size_type const l_total_combined = calculate_total_combined(l_data, l_merged);
      //Reposition the internal buffer left by the previous pass so the next
      //pass finds it where it expects it
      if(n && prev_use_internal_buf && prev_merge_left){
         if(is_merge_left || !use_internal_buf){
            move_data_backward(first-l_prev_block, l_prev_total_combined, first, common_xbuf);
         }
         else{
            //Put the buffer just after l_total_combined
            RandIt const buf_end = first+l_prev_total_combined;
            RandIt const buf_beg = buf_end-l_block;
            if(l_prev_total_combined > l_total_combined){
               size_type const l_diff = size_type(l_prev_total_combined - l_total_combined);
               move_data_backward(buf_beg-l_diff, l_diff, buf_end-l_diff, common_xbuf);
            }
            else if(l_prev_total_combined < l_total_combined){
               size_type const l_diff = size_type(l_total_combined - l_prev_total_combined);
               move_data_forward(buf_end, l_diff, buf_beg, common_xbuf);
            }
         }
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2("   After move_data     : ", l_data + l_intbuf);
      }

      //Combine to form l_merged*2 segments
      if(n_keys){
         size_type upper_n_keys_this_iter = size_type(2u*l_merged/l_block);
         if(upper_n_keys_this_iter > 256){
            adaptive_sort_combine_blocks
               ( keys, comp, !use_internal_buf || is_merge_left ? first : first-l_block
               , l_data, l_merged, l_block, use_internal_buf, common_xbuf, xbuf, comp, is_merge_left);
         }
         else{
            //Few keys needed: use a small array of integral keys instead of
            //the collected element keys
            unsigned char uint_keys[256];
            adaptive_sort_combine_blocks
               ( uint_keys, less(), !use_internal_buf || is_merge_left ? first : first-l_block
               , l_data, l_merged, l_block, use_internal_buf, common_xbuf, xbuf, comp, is_merge_left);
            }
      }
      else{
         //No element keys were collected: integral keys live in the trailing
         //part of the external buffer
         size_type *const uint_keys = xbuf.template aligned_trailing<size_type>();
         adaptive_sort_combine_blocks
            ( uint_keys, less(), !use_internal_buf || is_merge_left ? first : first-l_block
            , l_data, l_merged, l_block, use_internal_buf, common_xbuf, xbuf, comp, is_merge_left);
      }

      BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1(is_merge_left ? "   After comb blocks L:  " : "   After comb blocks R:  ", l_data + l_intbuf);
      prev_merge_left = is_merge_left;
      l_prev_total_combined = l_total_combined;
      l_prev_block = l_block;
      prev_use_internal_buf = use_internal_buf;
   }
   assert(l_prev_total_combined == l_data);
   bool const buffer_right = prev_use_internal_buf && prev_merge_left;

   l_intbuf = prev_use_internal_buf ? l_prev_block : 0u;
   n_keys = size_type(l_unique - l_intbuf);
   //Restore data from the external common buffer if used
   if(common_xbuf){
      if(buffer_right){
         boost::move(xbuf.data(), xbuf.data() + l_intbuf, buffer+l_data);
      }
      else{
         boost::move(xbuf.data(), xbuf.data() + l_intbuf, buffer);
      }
   }
   return buffer_right;
}
0352 
0353 
//Final phase: sorts the leading keys and the internal buffer (which is either
//at the front, or at the back when buffer_right is true) and merges them with
//the already sorted [keys+buffer, first+len) range to finish the sort.
template<class RandIt, class Compare, class XBuf>
void adaptive_sort_final_merge( bool buffer_right
                              , RandIt const first
                              , typename iter_size<RandIt>::type const l_intbuf
                              , typename iter_size<RandIt>::type const n_keys
                              , typename iter_size<RandIt>::type const len
                              , XBuf & xbuf
                              , Compare comp)
{
   //assert(n_keys || xbuf.size() == l_intbuf);
   xbuf.clear();

   typedef typename iter_size<RandIt>::type         size_type;

   size_type const n_key_plus_buf = size_type(l_intbuf+n_keys);
   if(buffer_right){
      //Buffer lives at [first+len-l_intbuf, first+len): sort it, merge it
      //with the sorted data, then do the same with the leading keys.
      //Use stable sort as some buffer elements might not be unique (see non_unique_buf)
      stable_sort(first+len-l_intbuf, first+len, comp, xbuf);
      //NOTE(review): antistable inverts tie-breaking here, presumably to keep
      //overall stability since buffer elements came from before the merged
      //range — confirm against merge implementation before changing.
      stable_merge( first+n_keys, first+len-l_intbuf, first+len,    antistable<Compare>(comp), xbuf);
      unstable_sort(first, first+n_keys, comp, xbuf);
      stable_merge(first, first+n_keys, first+len, comp, xbuf);
   }
   else{
      //Keys and buffer are contiguous at the front: sort them together
      //Use stable sort as some buffer elements might not be unique (see non_unique_buf)
      stable_sort(first, first+n_key_plus_buf, comp, xbuf);
      if(xbuf.capacity() >= n_key_plus_buf){
         //Enough external memory to merge the whole front in one buffered pass
         buffered_merge(first, first+n_key_plus_buf, first+len, comp, xbuf);
      }
      else if(xbuf.capacity() >= min_value<size_type>(l_intbuf, n_keys)){
         //Merge in two steps, smaller piece first
         stable_merge( first+n_keys, first+n_key_plus_buf
                     , first+len, comp, xbuf);
         stable_merge(first, first+n_keys, first+len, comp, xbuf);
      }
      else{
         //Not enough external memory: rotation-based stable merge
         stable_merge(first, first+n_key_plus_buf, first+len, comp, xbuf);
      }
   }
   BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("   After final_merge   : ", len);
}
0393 
//Computes the algorithm parameters (n_keys, l_intbuf, l_base, l_build_buf)
//and collects the unique elements needed as keys/buffer at the front of the
//range. Returns false when too few unique keys exist, in which case the
//caller must fall back to a slow stable sort.
template<class RandIt, class Compare, class Unsigned, class XBuf>
bool adaptive_sort_build_params
   (RandIt first, Unsigned const len, Compare comp
   , Unsigned &n_keys, Unsigned &l_intbuf, Unsigned &l_base, Unsigned &l_build_buf
   , XBuf & xbuf
   )
{
   typedef typename iter_size<RandIt>::type         size_type;

   //Calculate ideal parameters and try to collect needed unique keys
   l_base = 0u;

   //Try to find a value near sqrt(len) that is 2^N*l_base where
   //l_base <= AdaptiveSortInsertionSortThreshold. This property is important
   //as build_blocks merges to the left iteratively duplicating the
   //merged size and all the buffer must be used just before the final
   //merge to right step. This guarantees "build_blocks" produces 
   //segments of size l_build_buf*2, maximizing the classic merge phase.
   l_intbuf = size_type(ceil_sqrt_multiple(len, &l_base));

   //The internal buffer can be expanded if there is enough external memory
   while(xbuf.capacity() >= l_intbuf*2){
      l_intbuf = size_type(2u*l_intbuf);
   }

   //This is the minimum number of keys to implement the ideal algorithm
   //
   //l_intbuf is used as buffer plus the key count
   size_type n_min_ideal_keys = size_type(l_intbuf-1u);
   while(n_min_ideal_keys >= (len-l_intbuf-n_min_ideal_keys)/l_intbuf){
      --n_min_ideal_keys;
   }
   ++n_min_ideal_keys;
   assert(n_min_ideal_keys <= l_intbuf);

   //If the external buffer can also host the integral keys, no element keys
   //need to be collected at all
   if(xbuf.template supports_aligned_trailing<size_type>
         (l_intbuf, size_type((size_type(len-l_intbuf)-1u)/l_intbuf+1u))){
      n_keys = 0u;
      l_build_buf = l_intbuf;
   }
   else{
      //Try to achieve a l_build_buf of length l_intbuf*2, so that we can merge with that
      //l_intbuf*2 buffer in "build_blocks" and use half of them as buffer and the other half
      //as keys in combine_all_blocks. In that case n_keys >= n_min_ideal_keys but by a small margin.
      //
      //If available memory is 2*sqrt(l), then only sqrt(l) unique keys are needed,
      //(to be used for keys in combine_all_blocks) as the whole l_build_buf
      //will be backuped in the buffer during build_blocks.
      bool const non_unique_buf = xbuf.capacity() >= l_intbuf;
      size_type const to_collect = non_unique_buf ? n_min_ideal_keys : size_type(l_intbuf*2u);
      size_type collected = collect_unique(first, first+len, to_collect, comp, xbuf);

      //If available memory is 2*sqrt(l), then for "build_params" 
      //the situation is the same as if 2*l_intbuf were collected.
      if(non_unique_buf && collected == n_min_ideal_keys){
         l_build_buf = l_intbuf;
         n_keys = n_min_ideal_keys;
      }
      else if(collected == 2*l_intbuf){
         //l_intbuf*2 elements found. Use all of them in the build phase 
         l_build_buf = size_type(l_intbuf*2);
         n_keys = l_intbuf;
      }
      else if(collected >= (n_min_ideal_keys+l_intbuf)){ 
         l_build_buf = l_intbuf;
         n_keys = size_type(collected - l_intbuf);
      }
      //If collected keys are not enough, try to fix n_keys and l_intbuf. If no fix
      //is possible (due to very low unique keys), then go to a slow sort based on rotations.
      else{
         assert(collected < (n_min_ideal_keys+l_intbuf));
         if(collected < 4){  //No combination possible with less that 4 keys
            return false;
         }
         n_keys = l_intbuf;
         while(n_keys & (n_keys-1u)){
            n_keys &= size_type(n_keys-1u);  // make it power or 2
         }
         while(n_keys > collected){
            n_keys/=2;
         }
         //AdaptiveSortInsertionSortThreshold is always power of two so the minimum is power of two
         l_base = min_value<Unsigned>(n_keys, AdaptiveSortInsertionSortThreshold);
         l_intbuf = 0;
         l_build_buf = n_keys;
      }
      assert((n_keys+l_intbuf) >= l_build_buf);
   }

   return true;
}
0485 
0486 // Main explanation of the sort algorithm.
0487 //
0488 // csqrtlen = ceil(sqrt(len));
0489 //
0490 // * First, 2*csqrtlen unique elements are extracted from elements to be
0491 //   sorted and placed in the beginning of the range.
0492 //
0493 // * Step "build_blocks": In this nearly-classic merge step, 2*csqrtlen unique elements
0494 //   will be used as auxiliary memory, so trailing len-2*csqrtlen elements are
0495 //   are grouped in blocks of sorted 4*csqrtlen elements. At the end of the step
0496 //   2*csqrtlen unique elements are again the leading elements of the whole range.
0497 //
0498 // * Step "combine_blocks": pairs of previously formed blocks are merged with a different
0499 //   ("smart") algorithm to form blocks of 8*csqrtlen elements. This step is slower than the
0500 //   "build_blocks" step and repeated iteratively (forming blocks of 16*csqrtlen, 32*csqrtlen
0501 //   elements, etc) until all trailing (len-2*csqrtlen) elements are merged.
0502 //
0503 //   In "combine_blocks" len/csqrtlen elements used are as "keys" (markers) to
0504 //   know if elements belong to the first or second block to be merged and another 
0505 //   leading csqrtlen elements are used as buffer. Explanation of the "combine_blocks" step:
0506 //
0507 //   Iteratively until all trailing (len-2*csqrtlen) elements are merged:
0508 //      Iteratively for each pair of previously merged block:
0509 //         * Blocks are divided groups of csqrtlen elements and
0510 //           2*merged_block/csqrtlen keys are sorted to be used as markers
0511 //         * Groups are selection-sorted by first or last element (depending whether they are going
0512 //           to be merged to left or right) and keys are reordered accordingly as an imitation-buffer.
0513 //         * Elements of each block pair are merged using the csqrtlen buffer taking into account
0514 //           if they belong to the first half or second half (marked by the key).
0515 //
0516 // * In the final merge step leading elements (2*csqrtlen) are sorted and merged with
0517 //   rotations with the rest of sorted elements in the "combine_blocks" step.
0518 //
0519 // Corner cases:
0520 //
0521 // * If no 2*csqrtlen elements can be extracted:
0522 //
0523 //    * If csqrtlen+len/csqrtlen are extracted, then only csqrtlen elements are used
0524 //      as buffer in the "build_blocks" step forming blocks of 2*csqrtlen elements. This
0525 //      means that an additional "combine_blocks" step will be needed to merge all elements.
0526 //    
0527 //    * If no csqrtlen+len/csqrtlen elements can be extracted, but still more than a minimum,
0528 //      then reduces the number of elements used as buffer and keys in the "build_blocks"
0529 //      and "combine_blocks" steps. If "combine_blocks" has no enough keys due to this reduction
0530 //      then uses a rotation based smart merge.
0531 //
0532 //    * If the minimum number of keys can't be extracted, a rotation-based sorting is performed.
0533 //
0534 // * If auxiliary memory is more or equal than ceil(len/2), half-copying mergesort is used.
0535 //
0536 // * If auxiliary memory is more than csqrtlen+n_keys*sizeof(std::size_t),
0537 //   then only csqrtlen elements need to be extracted and "combine_blocks" will use integral
0538 //   keys to combine blocks.
0539 //
0540 // * If auxiliary memory is available, the "build_blocks" will be extended to build bigger blocks
0541 //   using classic merge and "combine_blocks" will use bigger blocks when merging.
template<class RandIt, class Compare, class XBuf>
void adaptive_sort_impl
   ( RandIt first
   , typename iter_size<RandIt>::type const len
   , Compare comp
   , XBuf & xbuf
   )
{
   typedef typename iter_size<RandIt>::type         size_type;

   //Small sorts go directly to insertion sort
   if(len <= size_type(AdaptiveSortInsertionSortThreshold)){
      insertion_sort(first, first + len, comp);
   }
   //With ceil(len/2) elements of external storage a classic half-copying
   //merge sort is possible
   else if((len-len/2) <= xbuf.capacity()){
      merge_sort(first, first+len, comp, xbuf.data());
   }
   else{
      //Make sure it is at least four
      BOOST_MOVE_STATIC_ASSERT(AdaptiveSortInsertionSortThreshold >= 4);

      size_type l_base = 0;
      size_type l_intbuf = 0;
      size_type n_keys = 0;
      size_type l_build_buf = 0;

      //Calculate and extract needed unique elements. If a minimum is not achieved
      //fallback to a slow stable sort
      if(!adaptive_sort_build_params(first, len, comp, n_keys, l_intbuf, l_base, l_build_buf, xbuf)){
         stable_sort(first, first+len, comp, xbuf);
      }
      else{
         assert(l_build_buf);
         //Otherwise, continue the adaptive_sort
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("\n   After collect_unique: ", len);
         size_type const n_key_plus_buf = size_type(l_intbuf+n_keys);
         //l_build_buf is always power of two if l_intbuf is zero
         assert(l_intbuf || (0 == (l_build_buf & (l_build_buf-1))));

         //Classic merge sort until internal buffer and xbuf are exhausted
         size_type const l_merged = adaptive_sort_build_blocks
            ( first + n_key_plus_buf-l_build_buf
            , size_type(len-n_key_plus_buf+l_build_buf)
            , l_base, l_build_buf, xbuf, comp);
         BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("   After build_blocks:   ", len);

         //Non-trivial merge
         bool const buffer_right = adaptive_sort_combine_all_blocks
            (first, n_keys, first+n_keys, size_type(len-n_keys), l_merged, l_intbuf, xbuf, comp);

         //Sort keys and buffer and merge the whole sequence
         adaptive_sort_final_merge(buffer_right, first, l_intbuf, n_keys, len, xbuf, comp);
      }
   }
}
0597 
0598 }  //namespace detail_adaptive {
0599 
0600 ///@endcond
0601 
0602 //! <b>Effects</b>: Sorts the elements in the range [first, last) in ascending order according
0603 //!   to comparison functor "comp". The sort is stable (order of equal elements
0604 //!   is guaranteed to be preserved). Performance is improved if additional raw storage is
0605 //!   provided.
0606 //!
0607 //! <b>Requires</b>:
0608 //!   - RandIt must meet the requirements of ValueSwappable and RandomAccessIterator.
0609 //!   - The type of dereferenced RandIt must meet the requirements of MoveAssignable and MoveConstructible.
0610 //!
0611 //! <b>Parameters</b>:
0612 //!   - first, last: the range of elements to sort
0613 //!   - comp: comparison function object which returns true if the first argument is ordered before the second.
0614 //!   - uninitialized, uninitialized_len: raw storage starting on "uninitialized", able to hold "uninitialized_len"
0615 //!      elements of type iterator_traits<RandIt>::value_type. Maximum performance is achieved when uninitialized_len
0616 //!      is ceil(std::distance(first, last)/2).
0617 //!
0618 //! <b>Throws</b>: If comp throws or the move constructor, move assignment or swap of the type
0619 //!   of dereferenced RandIt throws.
0620 //!
0621 //! <b>Complexity</b>: Always K x O(Nxlog(N)) comparisons and move assignments/constructors/swaps.
0622 //!   Comparisons are close to minimum even with no additional memory. Constant factor for data movement is minimized
0623 //!   when uninitialized_len is ceil(std::distance(first, last)/2). Pretty good enough performance is achieved when
0624 //!   ceil(sqrt(std::distance(first, last)))*2.
0625 //!
0626 //! <b>Caution</b>: Experimental implementation, not production-ready.
0627 template<class RandIt, class RandRawIt, class Compare>
0628 void adaptive_sort( RandIt first, RandIt last, Compare comp
0629                , RandRawIt uninitialized
0630                , typename iter_size<RandIt>::type uninitialized_len)
0631 {
0632    typedef typename iter_size<RandIt>::type  size_type;
0633    typedef typename iterator_traits<RandIt>::value_type value_type;
0634 
0635    ::boost::movelib::adaptive_xbuf<value_type, RandRawIt, size_type> xbuf(uninitialized, uninitialized_len);
0636    ::boost::movelib::detail_adaptive::adaptive_sort_impl(first, size_type(last - first), comp, xbuf);
0637 }
0638 
0639 template<class RandIt, class Compare>
0640 void adaptive_sort( RandIt first, RandIt last, Compare comp)
0641 {
0642    typedef typename iterator_traits<RandIt>::value_type value_type;
0643    adaptive_sort(first, last, comp, (value_type*)0, 0u);
0644 }
0645 
0646 }  //namespace movelib {
0647 }  //namespace boost {
0648 
0649 #include <boost/move/detail/config_end.hpp>
0650 
0651 #if defined(BOOST_CLANG) || (defined(BOOST_GCC) && (BOOST_GCC >= 40600))
0652 #pragma GCC diagnostic pop
0653 #endif
0654 
0655 #endif   //#define BOOST_MOVE_ADAPTIVE_SORT_HPP