File indexing completed on 2025-01-18 09:29:57
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
0013
0014 #include <iterator>
0015
0016 #include <boost/static_assert.hpp>
0017
0018 #include <boost/compute/system.hpp>
0019 #include <boost/compute/functional.hpp>
0020 #include <boost/compute/detail/meta_kernel.hpp>
0021 #include <boost/compute/command_queue.hpp>
0022 #include <boost/compute/container/array.hpp>
0023 #include <boost/compute/container/vector.hpp>
0024 #include <boost/compute/algorithm/copy_n.hpp>
0025 #include <boost/compute/algorithm/detail/inplace_reduce.hpp>
0026 #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
0027 #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
0028 #include <boost/compute/detail/iterator_range_size.hpp>
0029 #include <boost/compute/memory/local_buffer.hpp>
0030 #include <boost/compute/type_traits/result_of.hpp>
0031 #include <boost/compute/type_traits/is_device_iterator.hpp>
0032
0033 namespace boost {
0034 namespace compute {
0035 namespace detail {
0036
0037 template<class InputIterator, class OutputIterator, class BinaryFunction>
0038 size_t reduce(InputIterator first,
0039 size_t count,
0040 OutputIterator result,
0041 size_t block_size,
0042 BinaryFunction function,
0043 command_queue &queue)
0044 {
0045 typedef typename
0046 std::iterator_traits<InputIterator>::value_type
0047 input_type;
0048 typedef typename
0049 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
0050 result_type;
0051
0052 const context &context = queue.get_context();
0053 size_t block_count = count / 2 / block_size;
0054 size_t total_block_count =
0055 static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
0056
0057 if(block_count != 0){
0058 meta_kernel k("block_reduce");
0059 size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
0060 size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
0061
0062 k <<
0063 "const uint gid = get_global_id(0);\n" <<
0064 "const uint lid = get_local_id(0);\n" <<
0065
0066
0067 "block[lid] = " <<
0068 function(first[k.make_var<uint_>("gid*2+0")],
0069 first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
0070
0071
0072 "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
0073 " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0074 " uint mask = (i << 1) - 1;\n" <<
0075 " if((lid & mask) == 0){\n" <<
0076 " block[lid] = " <<
0077 function(k.expr<input_type>("block[lid]"),
0078 k.expr<input_type>("block[lid+i]")) << ";\n" <<
0079 " }\n" <<
0080 "}\n" <<
0081
0082
0083 "if(lid == 0)\n" <<
0084 " output[get_group_id(0)] = block[0];\n";
0085
0086 kernel kernel = k.compile(context);
0087 kernel.set_arg(output_arg, result.get_buffer());
0088 kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
0089
0090 queue.enqueue_1d_range_kernel(kernel,
0091 0,
0092 block_count * block_size,
0093 block_size);
0094 }
0095
0096
0097 if(block_count * block_size * 2 < count){
0098 size_t last_block_start = block_count * block_size * 2;
0099
0100 meta_kernel k("extra_serial_reduce");
0101 size_t count_arg = k.add_arg<uint_>("count");
0102 size_t offset_arg = k.add_arg<uint_>("offset");
0103 size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
0104 size_t output_offset_arg = k.add_arg<uint_>("output_offset");
0105
0106 k <<
0107 k.decl<result_type>("result") << " = \n" <<
0108 first[k.expr<uint_>("offset")] << ";\n" <<
0109 "for(uint i = offset + 1; i < count; i++)\n" <<
0110 " result = " <<
0111 function(k.var<result_type>("result"),
0112 first[k.var<uint_>("i")]) << ";\n" <<
0113 "output[output_offset] = result;\n";
0114
0115 kernel kernel = k.compile(context);
0116 kernel.set_arg(count_arg, static_cast<uint_>(count));
0117 kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
0118 kernel.set_arg(output_arg, result.get_buffer());
0119 kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
0120
0121 queue.enqueue_task(kernel);
0122 }
0123
0124 return total_block_count;
0125 }
0126
0127 template<class InputIterator, class BinaryFunction>
0128 inline vector<
0129 typename boost::compute::result_of<
0130 BinaryFunction(
0131 typename std::iterator_traits<InputIterator>::value_type,
0132 typename std::iterator_traits<InputIterator>::value_type
0133 )
0134 >::type
0135 >
0136 block_reduce(InputIterator first,
0137 size_t count,
0138 size_t block_size,
0139 BinaryFunction function,
0140 command_queue &queue)
0141 {
0142 typedef typename
0143 std::iterator_traits<InputIterator>::value_type
0144 input_type;
0145 typedef typename
0146 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
0147 result_type;
0148
0149 const context &context = queue.get_context();
0150 size_t total_block_count =
0151 static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
0152 vector<result_type> result_vector(total_block_count, context);
0153
0154 reduce(first, count, result_vector.begin(), block_size, function, queue);
0155
0156 return result_vector;
0157 }
0158
0159
0160 template<class InputIterator, class OutputIterator, class BinaryFunction>
0161 inline void generic_reduce(InputIterator first,
0162 InputIterator last,
0163 OutputIterator result,
0164 BinaryFunction function,
0165 command_queue &queue)
0166 {
0167 typedef typename
0168 std::iterator_traits<InputIterator>::value_type
0169 input_type;
0170 typedef typename
0171 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
0172 result_type;
0173
0174 const device &device = queue.get_device();
0175 const context &context = queue.get_context();
0176
0177 size_t count = detail::iterator_range_size(first, last);
0178
0179 if(device.type() & device::cpu){
0180 array<result_type, 1> value(context);
0181 detail::reduce_on_cpu(first, last, value.begin(), function, queue);
0182 boost::compute::copy_n(value.begin(), 1, result, queue);
0183 }
0184 else {
0185 size_t block_size = 256;
0186
0187
0188 vector<result_type> results = detail::block_reduce(first,
0189 count,
0190 block_size,
0191 function,
0192 queue);
0193
0194 if(results.size() > 1){
0195 detail::inplace_reduce(results.begin(),
0196 results.end(),
0197 function,
0198 queue);
0199 }
0200
0201 boost::compute::copy_n(results.begin(), 1, result, queue);
0202 }
0203 }
0204
0205 template<class InputIterator, class OutputIterator, class T>
0206 inline void dispatch_reduce(InputIterator first,
0207 InputIterator last,
0208 OutputIterator result,
0209 const plus<T> &function,
0210 command_queue &queue)
0211 {
0212 const context &context = queue.get_context();
0213 const device &device = queue.get_device();
0214
0215
0216 array<T, 1> value(context);
0217 if(device.type() & device::cpu){
0218 detail::reduce_on_cpu(first, last, value.begin(), function, queue);
0219 }
0220 else {
0221 reduce_on_gpu(first, last, value.begin(), function, queue);
0222 }
0223
0224
0225 copy_n(value.begin(), 1, result, queue);
0226 }
0227
0228 template<class InputIterator, class OutputIterator, class BinaryFunction>
0229 inline void dispatch_reduce(InputIterator first,
0230 InputIterator last,
0231 OutputIterator result,
0232 BinaryFunction function,
0233 command_queue &queue)
0234 {
0235 generic_reduce(first, last, result, function, queue);
0236 }
0237
0238 }
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249
0250
0251
0252
0253
0254
0255
0256
0257
0258
0259
0260
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275 template<class InputIterator, class OutputIterator, class BinaryFunction>
0276 inline void reduce(InputIterator first,
0277 InputIterator last,
0278 OutputIterator result,
0279 BinaryFunction function,
0280 command_queue &queue = system::default_queue())
0281 {
0282 BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
0283 if(first == last){
0284 return;
0285 }
0286
0287 detail::dispatch_reduce(first, last, result, function, queue);
0288 }
0289
0290
0291 template<class InputIterator, class OutputIterator>
0292 inline void reduce(InputIterator first,
0293 InputIterator last,
0294 OutputIterator result,
0295 command_queue &queue = system::default_queue())
0296 {
0297 BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
0298 typedef typename std::iterator_traits<InputIterator>::value_type T;
0299
0300 if(first == last){
0301 return;
0302 }
0303
0304 detail::dispatch_reduce(first, last, result, plus<T>(), queue);
0305 }
0306
0307 }
0308 }
0309
0310 #endif