#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP

#include <iterator>
#include <sstream>

#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

// Generic local (work-group) reduction body. The generated OpenCL code
// performs a tree reduction over the "scratch" array in local memory,
// synchronizing with a barrier before every step.
template<typename T, bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        std::stringstream k;

        // local reduction
        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
             "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
             "    uint mask = (i << 1) - 1;\n" <<
             "    if((lid & mask) == 0){\n" <<
             "        scratch[lid] += scratch[lid+i];\n" <<
             "    }\n" <<
             "}\n";
        return k.str();
    }
};
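
// For illustration only: with -DTPB=8 the loop above emits OpenCL that is
// roughly equivalent to the following unrolled sequence (a sketch, not part
// of the generated source):
//
//     barrier(CLK_LOCAL_MEM_FENCE);
//     if((lid & 1) == 0) scratch[lid] += scratch[lid + 1];
//     barrier(CLK_LOCAL_MEM_FENCE);
//     if((lid & 3) == 0) scratch[lid] += scratch[lid + 2];
//     barrier(CLK_LOCAL_MEM_FENCE);
//     if((lid & 7) == 0) scratch[lid] += scratch[lid + 4];
//     // the work-group's sum ends up in scratch[0]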

// Specialization for NVIDIA devices: a fully unrolled local reduction.
// TPB is a compile-time macro, so only the useful steps are emitted. Once
// fewer than 32 work-items remain the reduction stays within a single warp,
// so the barriers are dropped and the local pointer is marked volatile to
// force the intermediate stores to local memory.
template<typename T>
struct ReduceBody<T,true>
{
    static std::string body()
    {
        std::stringstream k;

        // local reduction while more than one warp is active
        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
             "if(TPB >= 1024){\n" <<
             "    if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
             "if(TPB >= 512){\n" <<
             "    if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
             "if(TPB >= 256){\n" <<
             "    if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
             "if(TPB >= 128){\n" <<
             "    if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<

             // warp-synchronous reduction, no barriers needed
             "if(lid < 32){\n" <<
             "    volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
             "    if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];}\n" <<
             "    if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];}\n" <<
             "    if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];}\n" <<
             "    if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];}\n" <<
             "    if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];}\n" <<
             "    if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];}\n" <<
             "}\n";
        return k.str();
    }
};
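
// For illustration only: because TPB is a compile-time macro, the OpenCL
// compiler prunes the dead branches above. With -DTPB=128, for example, the
// surviving reduction is roughly (a sketch, not part of the generated source;
// T stands for the reduced value type spliced in via type_name<T>()):
//
//     barrier(CLK_LOCAL_MEM_FENCE);
//     if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum; }
//     barrier(CLK_LOCAL_MEM_FENCE);
//     if(lid < 32){
//         volatile __local T *lmem = scratch;
//         lmem[lid] = sum = sum + lmem[lid + 32];
//         lmem[lid] = sum = sum + lmem[lid + 16];
//         lmem[lid] = sum = sum + lmem[lid +  8];
//         lmem[lid] = sum = sum + lmem[lid +  4];
//         lmem[lid] = sum = sum + lmem[lid +  2];
//         lmem[lid] = sum = sum + lmem[lid +  1];
//     }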

// Generic first pass: builds a meta_kernel that reads VPT values per
// work-item through the input iterator expression, reduces them in local
// memory and writes one partial sum per work-group to "output".
template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction over VPT values per work-item
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write the work-group's partial sum to the output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
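
// Launch-geometry sketch (an assumption about calculate_work_size(), chosen
// to match the inline rounding done in reduce_on_gpu() below): each work-item
// accumulates VPT values and each work-group covers VPT * TPB consecutive
// elements, so
//
//     work_size = round_up(ceil(count / vpt), tpb)
//
// e.g. count = 10000, vpt = 8, tpb = 128:
//     ceil(10000 / 8) = 1250  ->  rounded up to 1280  ->  10 work-groups,
// which leaves 10 partial sums (one per group) for the next pass.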

// Specialized first pass for contiguous device memory reduced with plus<T>:
// reuses the pre-compiled "reduce" kernel instead of generating a new one.
template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}
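
// For reference, the "reduce" kernel whose arguments are set above is the one
// generated by reduce_on_gpu() below; modulo meta_kernel boilerplate, its
// parameter list is roughly (T, VPT and TPB come from -D compiler flags):
//
//     __kernel void reduce(__global const T *input,
//                          const uint offset,
//                          const uint count,
//                          __global T *output,
//                          const uint output_offset);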

template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction over VPT values per work-item
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB];\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // pick the local reduction body based on the device vendor
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write the work-group's partial sum to the output
        "if(lid == 0){\n" <<
        "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load tuning parameters (values per thread, threads per block)
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // compiler flags for the reduce program
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load (or build and cache) the reduce program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create the reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from the input range into ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after the initial reduce (one partial sum per work-group)
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce back and forth between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;

    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            // one work-item per VPT values, rounded up to a multiple of TPB
            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass, reduce the remaining partial sums with a single work-group
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
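
// Minimal usage sketch (hypothetical names, assuming the caller already has a
// context "ctx", a command_queue "queue" and data in a compute::vector<int>,
// whose iterators are buffer_iterator<int>):
//
//     boost::compute::vector<int> data(1 << 20, ctx);   // filled elsewhere
//     boost::compute::vector<int> sum(1, ctx);
//     detail::reduce_on_gpu(data.begin(), data.end(),
//                           sum.begin(),
//                           boost::compute::plus<int>(),
//                           queue);
//     // sum[0] now holds the reduction of data
//
// In the library this entry point is normally reached through
// boost::compute::reduce() rather than called directly.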

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP