Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:29:57

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010 
0011 #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
0013 
0014 #include <iterator>
0015 
0016 #include <boost/static_assert.hpp>
0017 
0018 #include <boost/compute/system.hpp>
0019 #include <boost/compute/functional.hpp>
0020 #include <boost/compute/detail/meta_kernel.hpp>
0021 #include <boost/compute/command_queue.hpp>
0022 #include <boost/compute/container/array.hpp>
0023 #include <boost/compute/container/vector.hpp>
0024 #include <boost/compute/algorithm/copy_n.hpp>
0025 #include <boost/compute/algorithm/detail/inplace_reduce.hpp>
0026 #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
0027 #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
0028 #include <boost/compute/detail/iterator_range_size.hpp>
0029 #include <boost/compute/memory/local_buffer.hpp>
0030 #include <boost/compute/type_traits/result_of.hpp>
0031 #include <boost/compute/type_traits/is_device_iterator.hpp>
0032 
0033 namespace boost {
0034 namespace compute {
0035 namespace detail {
0036 
0037 template<class InputIterator, class OutputIterator, class BinaryFunction>
0038 size_t reduce(InputIterator first,
0039               size_t count,
0040               OutputIterator result,
0041               size_t block_size,
0042               BinaryFunction function,
0043               command_queue &queue)
0044 {
0045     typedef typename
0046         std::iterator_traits<InputIterator>::value_type
0047         input_type;
0048     typedef typename
0049         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
0050         result_type;
0051 
0052     const context &context = queue.get_context();
0053     size_t block_count = count / 2 / block_size;
0054     size_t total_block_count =
0055         static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
0056 
0057     if(block_count != 0){
0058         meta_kernel k("block_reduce");
0059         size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
0060         size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
0061 
0062         k <<
0063             "const uint gid = get_global_id(0);\n" <<
0064             "const uint lid = get_local_id(0);\n" <<
0065 
0066             // copy values to local memory
0067             "block[lid] = " <<
0068                 function(first[k.make_var<uint_>("gid*2+0")],
0069                          first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
0070 
0071             // perform reduction
0072             "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
0073             "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0074             "    uint mask = (i << 1) - 1;\n" <<
0075             "    if((lid & mask) == 0){\n" <<
0076             "        block[lid] = " <<
0077                          function(k.expr<input_type>("block[lid]"),
0078                                   k.expr<input_type>("block[lid+i]")) << ";\n" <<
0079             "    }\n" <<
0080             "}\n" <<
0081 
0082             // write block result to global output
0083             "if(lid == 0)\n" <<
0084             "    output[get_group_id(0)] = block[0];\n";
0085 
0086         kernel kernel = k.compile(context);
0087         kernel.set_arg(output_arg, result.get_buffer());
0088         kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
0089 
0090         queue.enqueue_1d_range_kernel(kernel,
0091                                       0,
0092                                       block_count * block_size,
0093                                       block_size);
0094     }
0095 
0096     // serially reduce any leftovers
0097     if(block_count * block_size * 2 < count){
0098         size_t last_block_start = block_count * block_size * 2;
0099 
0100         meta_kernel k("extra_serial_reduce");
0101         size_t count_arg = k.add_arg<uint_>("count");
0102         size_t offset_arg = k.add_arg<uint_>("offset");
0103         size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
0104         size_t output_offset_arg = k.add_arg<uint_>("output_offset");
0105 
0106         k <<
0107             k.decl<result_type>("result") << " = \n" <<
0108                 first[k.expr<uint_>("offset")] << ";\n" <<
0109             "for(uint i = offset + 1; i < count; i++)\n" <<
0110             "    result = " <<
0111                      function(k.var<result_type>("result"),
0112                               first[k.var<uint_>("i")]) << ";\n" <<
0113             "output[output_offset] = result;\n";
0114 
0115         kernel kernel = k.compile(context);
0116         kernel.set_arg(count_arg, static_cast<uint_>(count));
0117         kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
0118         kernel.set_arg(output_arg, result.get_buffer());
0119         kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
0120 
0121         queue.enqueue_task(kernel);
0122     }
0123 
0124     return total_block_count;
0125 }
0126 
0127 template<class InputIterator, class BinaryFunction>
0128 inline vector<
0129     typename boost::compute::result_of<
0130         BinaryFunction(
0131             typename std::iterator_traits<InputIterator>::value_type,
0132             typename std::iterator_traits<InputIterator>::value_type
0133         )
0134     >::type
0135 >
0136 block_reduce(InputIterator first,
0137              size_t count,
0138              size_t block_size,
0139              BinaryFunction function,
0140              command_queue &queue)
0141 {
0142     typedef typename
0143         std::iterator_traits<InputIterator>::value_type
0144         input_type;
0145     typedef typename
0146         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
0147         result_type;
0148 
0149     const context &context = queue.get_context();
0150     size_t total_block_count =
0151         static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
0152     vector<result_type> result_vector(total_block_count, context);
0153 
0154     reduce(first, count, result_vector.begin(), block_size, function, queue);
0155 
0156     return result_vector;
0157 }
0158 
0159 // Space complexity: O( ceil(n / 2 / 256) )
0160 template<class InputIterator, class OutputIterator, class BinaryFunction>
0161 inline void generic_reduce(InputIterator first,
0162                            InputIterator last,
0163                            OutputIterator result,
0164                            BinaryFunction function,
0165                            command_queue &queue)
0166 {
0167     typedef typename
0168         std::iterator_traits<InputIterator>::value_type
0169         input_type;
0170     typedef typename
0171         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
0172         result_type;
0173 
0174     const device &device = queue.get_device();
0175     const context &context = queue.get_context();
0176 
0177     size_t count = detail::iterator_range_size(first, last);
0178 
0179     if(device.type() & device::cpu){
0180         array<result_type, 1> value(context);
0181         detail::reduce_on_cpu(first, last, value.begin(), function, queue);
0182         boost::compute::copy_n(value.begin(), 1, result, queue);
0183     }
0184     else {
0185         size_t block_size = 256;
0186 
0187         // first pass
0188         vector<result_type> results = detail::block_reduce(first,
0189                                                            count,
0190                                                            block_size,
0191                                                            function,
0192                                                            queue);
0193 
0194         if(results.size() > 1){
0195             detail::inplace_reduce(results.begin(),
0196                                    results.end(),
0197                                    function,
0198                                    queue);
0199         }
0200 
0201         boost::compute::copy_n(results.begin(), 1, result, queue);
0202     }
0203 }
0204 
0205 template<class InputIterator, class OutputIterator, class T>
0206 inline void dispatch_reduce(InputIterator first,
0207                             InputIterator last,
0208                             OutputIterator result,
0209                             const plus<T> &function,
0210                             command_queue &queue)
0211 {
0212     const context &context = queue.get_context();
0213     const device &device = queue.get_device();
0214 
0215     // reduce to temporary buffer on device
0216     array<T, 1> value(context);
0217     if(device.type() & device::cpu){
0218         detail::reduce_on_cpu(first, last, value.begin(), function, queue);
0219     }
0220     else {
0221         reduce_on_gpu(first, last, value.begin(), function, queue);
0222     }
0223 
0224     // copy to result iterator
0225     copy_n(value.begin(), 1, result, queue);
0226 }
0227 
0228 template<class InputIterator, class OutputIterator, class BinaryFunction>
0229 inline void dispatch_reduce(InputIterator first,
0230                             InputIterator last,
0231                             OutputIterator result,
0232                             BinaryFunction function,
0233                             command_queue &queue)
0234 {
0235     generic_reduce(first, last, result, function, queue);
0236 }
0237 
0238 } // end detail namespace
0239 
0240 /// Returns the result of applying \p function to the elements in the
0241 /// range [\p first, \p last).
0242 ///
0243 /// If no function is specified, \c plus will be used.
0244 ///
0245 /// \param first first element in the input range
0246 /// \param last last element in the input range
0247 /// \param result iterator pointing to the output
0248 /// \param function binary reduction function
0249 /// \param queue command queue to perform the operation
0250 ///
0251 /// The \c reduce() algorithm assumes that the binary reduction function is
0252 /// associative. When used with non-associative functions the result may
0253 /// be non-deterministic and vary in precision. Notably this affects the
0254 /// \c plus<float>() function as floating-point addition is not associative
0255 /// and may produce slightly different results than a serial algorithm.
0256 ///
0257 /// This algorithm supports both host and device iterators for the
0258 /// result argument. This allows for values to be reduced and copied
0259 /// to the host all with a single function call.
0260 ///
0261 /// For example, to calculate the sum of the values in a device vector and
0262 /// copy the result to a value on the host:
0263 ///
0264 /// \snippet test/test_reduce.cpp sum_int
0265 ///
0266 /// Note that while the the \c reduce() algorithm is conceptually identical to
0267 /// the \c accumulate() algorithm, its implementation is substantially more
0268 /// efficient on parallel hardware. For more information, see the documentation
0269 /// on the \c accumulate() algorithm.
0270 ///
0271 /// Space complexity on GPUs: \Omega(n)<br>
0272 /// Space complexity on CPUs: \Omega(1)
0273 ///
0274 /// \see accumulate()
0275 template<class InputIterator, class OutputIterator, class BinaryFunction>
0276 inline void reduce(InputIterator first,
0277                    InputIterator last,
0278                    OutputIterator result,
0279                    BinaryFunction function,
0280                    command_queue &queue = system::default_queue())
0281 {
0282     BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
0283     if(first == last){
0284         return;
0285     }
0286 
0287     detail::dispatch_reduce(first, last, result, function, queue);
0288 }
0289 
0290 /// \overload
0291 template<class InputIterator, class OutputIterator>
0292 inline void reduce(InputIterator first,
0293                    InputIterator last,
0294                    OutputIterator result,
0295                    command_queue &queue = system::default_queue())
0296 {
0297     BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
0298     typedef typename std::iterator_traits<InputIterator>::value_type T;
0299 
0300     if(first == last){
0301         return;
0302     }
0303 
0304     detail::dispatch_reduce(first, last, result, plus<T>(), queue);
0305 }
0306 
0307 } // end compute namespace
0308 } // end boost namespace
0309 
0310 #endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP