Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:29:54

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010 
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
0013 
0014 #include <algorithm>
0015 
0016 #include <boost/compute/types.hpp>
0017 #include <boost/compute/command_queue.hpp>
0018 #include <boost/compute/algorithm/copy.hpp>
0019 #include <boost/compute/allocator/pinned_allocator.hpp>
0020 #include <boost/compute/container/vector.hpp>
0021 #include <boost/compute/detail/meta_kernel.hpp>
0022 #include <boost/compute/detail/iterator_range_size.hpp>
0023 #include <boost/compute/detail/parameter_cache.hpp>
0024 #include <boost/compute/memory/local_buffer.hpp>
0025 #include <boost/compute/type_traits/type_name.hpp>
0026 #include <boost/compute/utility/program_cache.hpp>
0027 
0028 namespace boost {
0029 namespace compute {
0030 namespace detail {
0031 
0032 template<class InputIterator>
0033 bool find_extrema_with_reduce_requirements_met(InputIterator first,
0034                                                InputIterator last,
0035                                                command_queue &queue)
0036 {
0037     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0038 
0039     const device &device = queue.get_device();
0040 
0041     // device must have dedicated local memory storage
0042     // otherwise reduction would be highly inefficient
0043     if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
0044     {
0045         return false;
0046     }
0047 
0048     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
0049     // local memory size in bytes (per compute unit)
0050     const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
0051 
0052     std::string cache_key = std::string("__boost_find_extrema_reduce_")
0053         + type_name<input_type>();
0054     // load parameters
0055     boost::shared_ptr<parameter_cache> parameters =
0056         detail::parameter_cache::get_global_cache(device);
0057 
0058     // Get preferred work group size
0059     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
0060 
0061     work_group_size = (std::min)(max_work_group_size, work_group_size);
0062 
0063     // local memory size needed to perform parallel reduction
0064     size_t required_local_mem_size = 0;
0065     // indices size
0066     required_local_mem_size += sizeof(uint_) * work_group_size;
0067     // values size
0068     required_local_mem_size += sizeof(input_type) * work_group_size;
0069 
0070     // at least 4 work groups per compute unit otherwise reduction
0071     // would be highly inefficient
0072     return ((required_local_mem_size * 4) <= local_mem_size);
0073 }
0074 
0075 /// \internal_
0076 /// Algorithm finds the first extremum in given range, i.e., with the lowest
0077 /// index.
0078 ///
0079 /// If \p use_input_idx is false, it's assumed that input data is ordered by
0080 /// increasing index and \p input_idx is not used in the algorithm.
0081 template<class InputIterator, class ResultIterator, class Compare>
0082 inline void find_extrema_with_reduce(InputIterator input,
0083                                      vector<uint_>::iterator input_idx,
0084                                      size_t count,
0085                                      ResultIterator result,
0086                                      vector<uint_>::iterator result_idx,
0087                                      size_t work_groups_no,
0088                                      size_t work_group_size,
0089                                      Compare compare,
0090                                      const bool find_minimum,
0091                                      const bool use_input_idx,
0092                                      command_queue &queue)
0093 {
0094     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0095 
0096     const context &context = queue.get_context();
0097 
0098     meta_kernel k("find_extrema_reduce");
0099     size_t count_arg = k.add_arg<uint_>("count");
0100     size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
0101     size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
0102 
0103     k <<
0104         // Work item global id
0105         k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
0106 
0107         // Index of element that will be read from input buffer
0108         k.decl<uint_>("idx") << " = gid;\n" <<
0109 
0110         k.decl<input_type>("acc") << ";\n" <<
0111         k.decl<uint_>("acc_idx") << ";\n" <<
0112         "if(gid < count) {\n" <<
0113             // Real index of currently best element
0114             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0115             k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
0116             "#else\n" <<
0117             k.var<uint_>("acc_idx") << " = idx;\n" <<
0118             "#endif\n" <<
0119 
0120             // Init accumulator with first[get_global_id(0)]
0121             "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
0122             "idx += get_global_size(0);\n" <<
0123         "}\n" <<
0124 
0125         k.decl<bool>("compare_result") << ";\n" <<
0126         k.decl<bool>("equal") << ";\n\n" <<
0127         "while( idx < count ){\n" <<
0128             // Next element
0129             k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
0130             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0131             k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
0132             "#endif\n" <<
0133 
0134             // Comparison between currently best element (acc) and next element
0135             "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
0136             "compare_result = " << compare(k.var<input_type>("next"),
0137                                            k.var<input_type>("acc")) << ";\n" <<
0138             "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0139             "equal = !compare_result && !" <<
0140                 compare(k.var<input_type>("acc"),
0141                         k.var<input_type>("next")) << ";\n" <<
0142             "# endif\n" <<
0143             "#else\n" <<
0144             "compare_result = " << compare(k.var<input_type>("acc"),
0145                                            k.var<input_type>("next")) << ";\n" <<
0146             "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0147             "equal = !compare_result && !" <<
0148                 compare(k.var<input_type>("next"),
0149                         k.var<input_type>("acc")) << ";\n" <<
0150             "# endif\n" <<
0151             "#endif\n" <<
0152 
0153             // save the winner
0154             "acc = compare_result ? acc : next;\n" <<
0155             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0156             "acc_idx = compare_result ? " <<
0157                 "acc_idx : " <<
0158                 "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
0159             "#else\n" <<
0160             "acc_idx = compare_result ? acc_idx : idx;\n" <<
0161             "#endif\n" <<
0162             "idx += get_global_size(0);\n" <<
0163         "}\n\n" <<
0164 
0165         // Work item local id
0166         k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
0167         "block[lid] = acc;\n" <<
0168         "block_idx[lid] = acc_idx;\n" <<
0169         "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0170 
0171         k.decl<uint_>("group_offset") <<
0172             " = count - (get_local_size(0) * get_group_id(0));\n\n";
0173 
0174     k <<
0175         "#pragma unroll\n"
0176         "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
0177              "offset = offset / 2) {\n" <<
0178              "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
0179                  k.decl<input_type>("mine") << " = block[lid];\n" <<
0180                  k.decl<input_type>("other") << " = block[lid+offset];\n" <<
0181                  "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
0182                  "compare_result = " << compare(k.var<input_type>("other"),
0183                                                 k.var<input_type>("mine")) << ";\n" <<
0184                  "equal = !compare_result && !" <<
0185                      compare(k.var<input_type>("mine"),
0186                              k.var<input_type>("other")) << ";\n" <<
0187                  "#else\n" <<
0188                  "compare_result = " << compare(k.var<input_type>("mine"),
0189                                                 k.var<input_type>("other")) << ";\n" <<
0190                  "equal = !compare_result && !" <<
0191                      compare(k.var<input_type>("other"),
0192                              k.var<input_type>("mine")) << ";\n" <<
0193                  "#endif\n" <<
0194                  "block[lid] = compare_result ? mine : other;\n" <<
0195                  k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
0196                  k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
0197                  "block_idx[lid] = compare_result ? " <<
0198                      "mine_idx : " <<
0199                      "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
0200              "}\n"
0201              "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0202         "}\n\n" <<
0203 
0204          // write block result to global output
0205         "if(lid == 0){\n" <<
0206             result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
0207             result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
0208         "}";
0209 
0210     std::string options;
0211     if(!find_minimum){
0212         options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
0213     }
0214     if(use_input_idx){
0215         options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
0216     }
0217 
0218     kernel kernel = k.compile(context, options);
0219 
0220     kernel.set_arg(count_arg, static_cast<uint_>(count));
0221     kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
0222     kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
0223 
0224     queue.enqueue_1d_range_kernel(kernel,
0225                                   0,
0226                                   work_groups_no * work_group_size,
0227                                   work_group_size);
0228 }
0229 
0230 template<class InputIterator, class ResultIterator, class Compare>
0231 inline void find_extrema_with_reduce(InputIterator input,
0232                                      size_t count,
0233                                      ResultIterator result,
0234                                      vector<uint_>::iterator result_idx,
0235                                      size_t work_groups_no,
0236                                      size_t work_group_size,
0237                                      Compare compare,
0238                                      const bool find_minimum,
0239                                      command_queue &queue)
0240 {
0241     // dummy will not be used
0242     buffer_iterator<uint_> dummy = result_idx;
0243     return find_extrema_with_reduce(
0244         input, dummy, count, result, result_idx, work_groups_no,
0245         work_group_size, compare, find_minimum, false, queue
0246     );
0247 }
0248 
0249 // Space complexity: \Omega(2 * work-group-size * work-groups-per-compute-unit)
0250 template<class InputIterator, class Compare>
0251 InputIterator find_extrema_with_reduce(InputIterator first,
0252                                        InputIterator last,
0253                                        Compare compare,
0254                                        const bool find_minimum,
0255                                        command_queue &queue)
0256 {
0257     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0258     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0259 
0260     const context &context = queue.get_context();
0261     const device &device = queue.get_device();
0262 
0263     // Getting information about used queue and device
0264     const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
0265     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
0266 
0267     const size_t count = detail::iterator_range_size(first, last);
0268 
0269     std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
0270         + type_name<input_type>();
0271 
0272     // load parameters
0273     boost::shared_ptr<parameter_cache> parameters =
0274         detail::parameter_cache::get_global_cache(device);
0275 
0276     // get preferred work group size and preferred number
0277     // of work groups per compute unit
0278     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
0279     size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
0280 
0281     // calculate work group size and number of work groups
0282     work_group_size = (std::min)(max_work_group_size, work_group_size);
0283     size_t work_groups_no = compute_units_no * work_groups_per_cu;
0284     work_groups_no = (std::min)(
0285         work_groups_no,
0286         static_cast<size_t>(std::ceil(float(count) / work_group_size))
0287     );
0288 
0289     // phase I: finding candidates for extremum
0290 
0291     // device buffors for extremum candidates and their indices
0292     // each work-group computes its candidate
0293     vector<input_type> candidates(work_groups_no, context);
0294     vector<uint_> candidates_idx(work_groups_no, context);
0295 
0296     // finding candidates for first extremum and their indices
0297     find_extrema_with_reduce(
0298         first, count, candidates.begin(), candidates_idx.begin(),
0299         work_groups_no, work_group_size, compare, find_minimum, queue
0300     );
0301 
0302     // phase II: finding extremum from among the candidates
0303 
0304     // zero-copy buffers for final result (value and index)
0305     vector<input_type, ::boost::compute::pinned_allocator<input_type> >
0306         result(1, context);
0307     vector<uint_, ::boost::compute::pinned_allocator<uint_> >
0308         result_idx(1, context);
0309 
0310     // get extremum from among the candidates
0311     find_extrema_with_reduce(
0312         candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
0313         result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
0314     );
0315 
0316     // mapping extremum index to host
0317     uint_* result_idx_host_ptr =
0318         static_cast<uint_*>(
0319             queue.enqueue_map_buffer(
0320                 result_idx.get_buffer(), command_queue::map_read,
0321                 0, sizeof(uint_)
0322             )
0323         );
0324 
0325     return first + static_cast<difference_type>(*result_idx_host_ptr);
0326 }
0327 
0328 template<class InputIterator>
0329 InputIterator find_extrema_with_reduce(InputIterator first,
0330                                        InputIterator last,
0331                                        ::boost::compute::less<
0332                                            typename std::iterator_traits<
0333                                                InputIterator
0334                                            >::value_type
0335                                        >
0336                                        compare,
0337                                        const bool find_minimum,
0338                                        command_queue &queue)
0339 {
0340     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0341     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0342 
0343     const context &context = queue.get_context();
0344     const device &device = queue.get_device();
0345 
0346     // Getting information about used queue and device
0347     const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
0348     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
0349 
0350     const size_t count = detail::iterator_range_size(first, last);
0351 
0352     std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
0353         + type_name<input_type>();
0354 
0355     // load parameters
0356     boost::shared_ptr<parameter_cache> parameters =
0357         detail::parameter_cache::get_global_cache(device);
0358 
0359     // get preferred work group size and preferred number
0360     // of work groups per compute unit
0361     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
0362     size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
0363 
0364     // calculate work group size and number of work groups
0365     work_group_size = (std::min)(max_work_group_size, work_group_size);
0366     size_t work_groups_no = compute_units_no * work_groups_per_cu;
0367     work_groups_no = (std::min)(
0368         work_groups_no,
0369         static_cast<size_t>(std::ceil(float(count) / work_group_size))
0370     );
0371 
0372     // phase I: finding candidates for extremum
0373 
0374     // device buffors for extremum candidates and their indices
0375     // each work-group computes its candidate
0376     // zero-copy buffers are used to eliminate copying data back to host
0377     vector<input_type, ::boost::compute::pinned_allocator<input_type> >
0378         candidates(work_groups_no, context);
0379     vector<uint_, ::boost::compute::pinned_allocator <uint_> >
0380         candidates_idx(work_groups_no, context);
0381 
0382     // finding candidates for first extremum and their indices
0383     find_extrema_with_reduce(
0384         first, count, candidates.begin(), candidates_idx.begin(),
0385         work_groups_no, work_group_size, compare, find_minimum, queue
0386     );
0387 
0388     // phase II: finding extremum from among the candidates
0389 
0390     // mapping candidates and their indices to host
0391     input_type* candidates_host_ptr =
0392         static_cast<input_type*>(
0393             queue.enqueue_map_buffer(
0394                 candidates.get_buffer(), command_queue::map_read,
0395                 0, work_groups_no * sizeof(input_type)
0396             )
0397         );
0398 
0399     uint_* candidates_idx_host_ptr =
0400         static_cast<uint_*>(
0401             queue.enqueue_map_buffer(
0402                 candidates_idx.get_buffer(), command_queue::map_read,
0403                 0, work_groups_no * sizeof(uint_)
0404             )
0405         );
0406 
0407     input_type* i = candidates_host_ptr;
0408     uint_* idx = candidates_idx_host_ptr;
0409     uint_* extremum_idx = idx;
0410     input_type extremum = *candidates_host_ptr;
0411     i++; idx++;
0412 
0413     // find extremum (serial) from among the candidates on host
0414     if(!find_minimum) {
0415         while(idx != (candidates_idx_host_ptr + work_groups_no)) {
0416             input_type next = *i;
0417             bool compare_result =  next > extremum;
0418             bool equal = next == extremum;
0419             extremum = compare_result ? next : extremum;
0420             extremum_idx = compare_result ? idx : extremum_idx;
0421             extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
0422             idx++, i++;
0423         }
0424     }
0425     else {
0426         while(idx != (candidates_idx_host_ptr + work_groups_no)) {
0427             input_type next = *i;
0428             bool compare_result = next < extremum;
0429             bool equal = next == extremum;
0430             extremum = compare_result ? next : extremum;
0431             extremum_idx = compare_result ? idx : extremum_idx;
0432             extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
0433             idx++, i++;
0434         }
0435     }
0436 
0437     return first + static_cast<difference_type>(*extremum_idx);
0438 }
0439 
0440 } // end detail namespace
0441 } // end compute namespace
0442 } // end boost namespace
0443 
0444 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP