Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:29:55

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010 
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
0013 
#include <iterator>
#include <sstream>

#include <boost/compute/command_queue.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>
#include <boost/compute/utility/source.hpp>
0025 
0026 namespace boost {
0027 namespace compute {
0028 namespace detail {
0029 
0030 /// \internal
0031 /// body reduction inside a warp
0032 template<typename T,bool isNvidiaDevice>
0033 struct ReduceBody
0034 {
0035     static std::string body()
0036     {
0037         std::stringstream k;
0038         // local reduction
0039         k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
0040              "   barrier(CLK_LOCAL_MEM_FENCE);\n"  <<
0041              "   uint mask = (i << 1) - 1;\n"      <<
0042              "   if((lid & mask) == 0){\n"         <<
0043              "       scratch[lid] += scratch[lid+i];\n" <<
0044              "   }\n" <<
0045             "}\n";
0046         return k.str();
0047     }
0048 };
0049 
0050 /// \internal
0051 /// body reduction inside a warp
0052 /// for nvidia device we can use the "unsafe"
0053 /// memory optimisation
0054 template<typename T>
0055 struct ReduceBody<T,true>
0056 {
0057     static std::string body()
0058     {
0059         std::stringstream k;
0060         // local reduction
0061         // we use TPB to compile only useful instruction
0062         // local reduction when size is greater than warp size
0063         k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0064         "if(TPB >= 1024){\n" <<
0065             "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
0066          "if(TPB >= 512){\n" <<
0067             "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
0068          "if(TPB >= 256){\n" <<
0069             "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
0070          "if(TPB >= 128){\n" <<
0071             "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<
0072 
0073         // warp reduction
0074         "if(lid < 32){\n" <<
0075             // volatile this way we don't need any barrier
0076             "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
0077             "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
0078             "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
0079             "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
0080             "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
0081             "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
0082             "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
0083         "}\n";
0084         return k.str();
0085     }
0086 };
0087 
/// \internal
/// First reduction pass for a generic input iterator: generates and
/// compiles a dedicated "initial_reduce" kernel that evaluates the
/// iterator expression, privately accumulates VPT elements per thread,
/// tree-reduces them in local memory and writes one partial sum per
/// work-group into \p result.
///
/// \param first,last     input range to reduce
/// \param result         buffer receiving one partial sum per work-group
/// \param function        unused: the generated source hard-codes "+"
///                        (NOTE(review): presumably this overload is only
///                        reached with plus-like functions — verify caller)
/// \param reduce_kernel   unused by this overload; the buffer_iterator
///                        overload reuses it instead
/// \param vpt            values processed per thread (becomes the VPT macro)
/// \param tpb            threads per block / work-group size (TPB macro)
/// \param queue          command queue the kernel is enqueued on
///
/// NOTE(review): the kernel is compiled with k.compile() on every call —
/// no program_cache lookup as in reduce_on_gpu() below; confirm whether
/// this path is hot enough to matter.
template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    // T is the accumulation type: the result type of applying the binary
    // function to two input elements.
    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction: each thread sums VPT strided elements,
        // guarding against reading past "count" in the last group
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction (portable barrier-based variant; the NVIDIA
        // fast path is only selected in reduce_on_gpu() below)
        ReduceBody<T,false>::body() <<

        // thread 0 writes the work-group's partial sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    // compile with VPT/TPB baked in as macros
    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    // global size rounded so each work-group handles VPT*TPB elements
    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
0143 
0144 template<class T>
0145 inline void initial_reduce(const buffer_iterator<T> &first,
0146                            const buffer_iterator<T> &last,
0147                            const buffer &result,
0148                            const plus<T> &function,
0149                            kernel &reduce_kernel,
0150                            const uint_ vpt,
0151                            const uint_ tpb,
0152                            command_queue &queue)
0153 {
0154     (void) function;
0155 
0156     size_t count = std::distance(first, last);
0157 
0158     reduce_kernel.set_arg(0, first.get_buffer());
0159     reduce_kernel.set_arg(1, uint_(first.get_index()));
0160     reduce_kernel.set_arg(2, uint_(count));
0161     reduce_kernel.set_arg(3, result);
0162     reduce_kernel.set_arg(4, uint_(0));
0163 
0164     size_t work_size = calculate_work_size(count, vpt, tpb);
0165 
0166     queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
0167 }
0168 
/// \internal
/// Multi-pass GPU reduction of [first, last) into *result.
///
/// Builds (or fetches from the program cache) a "reduce" kernel with the
/// argument order (input, offset, count, output, output_offset), then:
///   1. first pass: initial_reduce() writes one partial sum per
///      work-group into the "ping" buffer;
///   2. middle passes: the kernel is re-run ping<->pong until at most
///      vpt*tpb partial sums remain;
///   3. final pass: a single work-group reduces the remainder straight
///      into result's buffer at result's index.
///
/// \param function unused in the generated source — the kernel hard-codes
///        "+" (it is only forwarded to initial_reduce, which also ignores
///        it); NOTE(review): presumably callers guarantee a plus-like
///        function on this path — verify at the call site.
template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    // kernel arguments; indices 0-4 match the set_arg() calls below and
    // in the buffer_iterator initial_reduce() overload
    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction: each thread sums VPT strided elements,
        // guarded against running past "count" in the last group
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // vendor dispatch: NVIDIA devices get the warp-synchronous unrolled
    // local reduction, everything else the portable barrier-based one
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // thread 0 writes the work-group's partial sum to output
         "if(lid == 0){\n" <<
         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    // cache key is per element type; also used to look up tuning params
    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load tuning parameters (defaults: 8 values per thread, 128 threads
    // per block)
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags; VPT/TPB become macros in the source
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program (compiled at most once per context/type/options)
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping: one partial sum per
    // work-group, hence ceil(count / (vpt*tpb)) elements
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce between ping and pong
    // NOTE(review): when count <= vpt*tpb this size is 0 (integer
    // division) and pong is never used, but a zero-sized buffer may be
    // rejected by some OpenCL implementations — verify.
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            // global size: one thread per vpt elements, rounded up to a
            // whole number of work-groups
            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            // swap roles of ping/pong for the next pass
            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass: a single work-group reduces the remaining partial sums
    // directly into the caller's buffer at the caller's index
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
0281 
0282 } // end detail namespace
0283 } // end compute namespace
0284 } // end boost namespace
0285 
0286 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP