algorithm/detail/inplace_reduce.hpp

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
0013
0014 #include <iterator>
0015
0016 #include <boost/utility/result_of.hpp>
0017
0018 #include <boost/compute/command_queue.hpp>
0019 #include <boost/compute/container/vector.hpp>
0020 #include <boost/compute/detail/iterator_range_size.hpp>
0021 #include <boost/compute/memory/local_buffer.hpp>
0022
0023 namespace boost {
0024 namespace compute {
0025 namespace detail {
0026
0027 template<class Iterator, class BinaryFunction>
0028 inline void inplace_reduce(Iterator first,
0029                            Iterator last,
0030                            BinaryFunction function,
0031                            command_queue &queue)
0032 {
0033     typedef typename
0034         std::iterator_traits<Iterator>::value_type
0035         value_type;
0036
0037     size_t input_size = iterator_range_size(first, last);
0038     if(input_size < 2){
0039         return;
0040     }
0041
0042     const context &context = queue.get_context();
0043
0044     size_t block_size = 64;
0045     size_t values_per_thread = 8;
0046     size_t block_count = input_size / (block_size * values_per_thread);
0047     if(block_count * block_size * values_per_thread != input_size)
0048         block_count++;
0049
0050     vector<value_type> output(block_count, context);
0051
0052     meta_kernel k("inplace_reduce");
0053     size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
0054     size_t input_size_arg = k.add_arg<const uint_>("input_size");
0055     size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
0056     size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
0057     k <<
0058         "const uint gid = get_global_id(0);\n" <<
0059         "const uint lid = get_local_id(0);\n" <<
0060         "const uint values_per_thread =\n"
0061             << uint_(values_per_thread) << ";\n" <<
0062
0063         // thread reduce
0064         "const uint index = gid * values_per_thread;\n" <<
0065         "if(index < input_size){\n" <<
0066             k.decl<value_type>("sum") << " = input[index];\n" <<
0067             "for(uint i = 1;\n" <<
0068                  "i < values_per_thread && (index + i) < input_size;\n" <<
0069                  "i++){\n" <<
0070             "    sum = " <<
0071                      function(k.var<value_type>("sum"),
0072                               k.var<value_type>("input[index+i]")) << ";\n" <<
0073             "}\n" <<
0074             "scratch[lid] = sum;\n" <<
0075         "}\n" <<
0076
0077         // local reduce
0078         "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
0079         "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0080         "    uint mask = (i << 1) - 1;\n" <<
0081         "    uint next_index = (gid + i) * values_per_thread;\n"
0082         "    if((lid & mask) == 0 && next_index < input_size){\n" <<
0083         "        scratch[lid] = " <<
0084                      function(k.var<value_type>("scratch[lid]"),
0085                               k.var<value_type>("scratch[lid+i]")) << ";\n" <<
0086         "    }\n" <<
0087         "}\n" <<
0088
0089         // write output for block
0090         "if(lid == 0){\n" <<
0091         "    output[get_group_id(0)] = scratch[0];\n" <<
0092         "}\n"
0093         ;
0094
0095     const buffer *input_buffer = &first.get_buffer();
0096     const buffer *output_buffer = &output.get_buffer();
0097
0098     kernel kernel = k.compile(context);
0099
0100     while(input_size > 1){
0101         kernel.set_arg(input_arg, *input_buffer);
0102         kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
0103         kernel.set_arg(output_arg, *output_buffer);
0104         kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));
0105
0106         queue.enqueue_1d_range_kernel(kernel,
0107                                       0,
0108                                       block_count * block_size,
0109                                       block_size);
0110
0111         input_size =
0112             static_cast<size_t>(
0113                 std::ceil(float(input_size) / (block_size * values_per_thread)
0114             )
0115         );
0116
0117         block_count = input_size / (block_size * values_per_thread);
0118         if(block_count * block_size * values_per_thread != input_size)
0119             block_count++;
0120
0121         std::swap(input_buffer, output_buffer);
0122     }
0123
0124     if(input_buffer != &first.get_buffer()){
0125         ::boost::compute::copy(output.begin(),
0126                                output.begin() + 1,
0127                                first,
0128                                queue);
0129     }
0130 }
0131
0132 } // end detail namespace
0133 } // end compute namespace
0134 } // end boost namespace
0135
0136 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP