File indexing completed on 2025-01-18 09:29:54
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
0013
0014 #include <algorithm>
0015
0016 #include <boost/compute/types.hpp>
0017 #include <boost/compute/command_queue.hpp>
0018 #include <boost/compute/algorithm/copy.hpp>
0019 #include <boost/compute/allocator/pinned_allocator.hpp>
0020 #include <boost/compute/container/vector.hpp>
0021 #include <boost/compute/detail/meta_kernel.hpp>
0022 #include <boost/compute/detail/iterator_range_size.hpp>
0023 #include <boost/compute/detail/parameter_cache.hpp>
0024 #include <boost/compute/memory/local_buffer.hpp>
0025 #include <boost/compute/type_traits/type_name.hpp>
0026 #include <boost/compute/utility/program_cache.hpp>
0027
0028 namespace boost {
0029 namespace compute {
0030 namespace detail {
0031
0032 template<class InputIterator>
0033 bool find_extrema_with_reduce_requirements_met(InputIterator first,
0034 InputIterator last,
0035 command_queue &queue)
0036 {
0037 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0038
0039 const device &device = queue.get_device();
0040
0041
0042
0043 if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
0044 {
0045 return false;
0046 }
0047
0048 const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
0049
0050 const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
0051
0052 std::string cache_key = std::string("__boost_find_extrema_reduce_")
0053 + type_name<input_type>();
0054
0055 boost::shared_ptr<parameter_cache> parameters =
0056 detail::parameter_cache::get_global_cache(device);
0057
0058
0059 size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
0060
0061 work_group_size = (std::min)(max_work_group_size, work_group_size);
0062
0063
0064 size_t required_local_mem_size = 0;
0065
0066 required_local_mem_size += sizeof(uint_) * work_group_size;
0067
0068 required_local_mem_size += sizeof(input_type) * work_group_size;
0069
0070
0071
0072 return ((required_local_mem_size * 4) <= local_mem_size);
0073 }
0074
0075
0076
0077
0078
0079
0080
0081 template<class InputIterator, class ResultIterator, class Compare>
0082 inline void find_extrema_with_reduce(InputIterator input,
0083 vector<uint_>::iterator input_idx,
0084 size_t count,
0085 ResultIterator result,
0086 vector<uint_>::iterator result_idx,
0087 size_t work_groups_no,
0088 size_t work_group_size,
0089 Compare compare,
0090 const bool find_minimum,
0091 const bool use_input_idx,
0092 command_queue &queue)
0093 {
0094 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0095
0096 const context &context = queue.get_context();
0097
0098 meta_kernel k("find_extrema_reduce");
0099 size_t count_arg = k.add_arg<uint_>("count");
0100 size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
0101 size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
0102
0103 k <<
0104
0105 k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
0106
0107
0108 k.decl<uint_>("idx") << " = gid;\n" <<
0109
0110 k.decl<input_type>("acc") << ";\n" <<
0111 k.decl<uint_>("acc_idx") << ";\n" <<
0112 "if(gid < count) {\n" <<
0113
0114 "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0115 k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
0116 "#else\n" <<
0117 k.var<uint_>("acc_idx") << " = idx;\n" <<
0118 "#endif\n" <<
0119
0120
0121 "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
0122 "idx += get_global_size(0);\n" <<
0123 "}\n" <<
0124
0125 k.decl<bool>("compare_result") << ";\n" <<
0126 k.decl<bool>("equal") << ";\n\n" <<
0127 "while( idx < count ){\n" <<
0128
0129 k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
0130 "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0131 k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
0132 "#endif\n" <<
0133
0134
0135 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
0136 "compare_result = " << compare(k.var<input_type>("next"),
0137 k.var<input_type>("acc")) << ";\n" <<
0138 "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0139 "equal = !compare_result && !" <<
0140 compare(k.var<input_type>("acc"),
0141 k.var<input_type>("next")) << ";\n" <<
0142 "# endif\n" <<
0143 "#else\n" <<
0144 "compare_result = " << compare(k.var<input_type>("acc"),
0145 k.var<input_type>("next")) << ";\n" <<
0146 "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0147 "equal = !compare_result && !" <<
0148 compare(k.var<input_type>("next"),
0149 k.var<input_type>("acc")) << ";\n" <<
0150 "# endif\n" <<
0151 "#endif\n" <<
0152
0153
0154 "acc = compare_result ? acc : next;\n" <<
0155 "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
0156 "acc_idx = compare_result ? " <<
0157 "acc_idx : " <<
0158 "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
0159 "#else\n" <<
0160 "acc_idx = compare_result ? acc_idx : idx;\n" <<
0161 "#endif\n" <<
0162 "idx += get_global_size(0);\n" <<
0163 "}\n\n" <<
0164
0165
0166 k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
0167 "block[lid] = acc;\n" <<
0168 "block_idx[lid] = acc_idx;\n" <<
0169 "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0170
0171 k.decl<uint_>("group_offset") <<
0172 " = count - (get_local_size(0) * get_group_id(0));\n\n";
0173
0174 k <<
0175 "#pragma unroll\n"
0176 "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
0177 "offset = offset / 2) {\n" <<
0178 "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
0179 k.decl<input_type>("mine") << " = block[lid];\n" <<
0180 k.decl<input_type>("other") << " = block[lid+offset];\n" <<
0181 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
0182 "compare_result = " << compare(k.var<input_type>("other"),
0183 k.var<input_type>("mine")) << ";\n" <<
0184 "equal = !compare_result && !" <<
0185 compare(k.var<input_type>("mine"),
0186 k.var<input_type>("other")) << ";\n" <<
0187 "#else\n" <<
0188 "compare_result = " << compare(k.var<input_type>("mine"),
0189 k.var<input_type>("other")) << ";\n" <<
0190 "equal = !compare_result && !" <<
0191 compare(k.var<input_type>("other"),
0192 k.var<input_type>("mine")) << ";\n" <<
0193 "#endif\n" <<
0194 "block[lid] = compare_result ? mine : other;\n" <<
0195 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
0196 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
0197 "block_idx[lid] = compare_result ? " <<
0198 "mine_idx : " <<
0199 "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
0200 "}\n"
0201 "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0202 "}\n\n" <<
0203
0204
0205 "if(lid == 0){\n" <<
0206 result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
0207 result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
0208 "}";
0209
0210 std::string options;
0211 if(!find_minimum){
0212 options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
0213 }
0214 if(use_input_idx){
0215 options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
0216 }
0217
0218 kernel kernel = k.compile(context, options);
0219
0220 kernel.set_arg(count_arg, static_cast<uint_>(count));
0221 kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
0222 kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
0223
0224 queue.enqueue_1d_range_kernel(kernel,
0225 0,
0226 work_groups_no * work_group_size,
0227 work_group_size);
0228 }
0229
0230 template<class InputIterator, class ResultIterator, class Compare>
0231 inline void find_extrema_with_reduce(InputIterator input,
0232 size_t count,
0233 ResultIterator result,
0234 vector<uint_>::iterator result_idx,
0235 size_t work_groups_no,
0236 size_t work_group_size,
0237 Compare compare,
0238 const bool find_minimum,
0239 command_queue &queue)
0240 {
0241
0242 buffer_iterator<uint_> dummy = result_idx;
0243 return find_extrema_with_reduce(
0244 input, dummy, count, result, result_idx, work_groups_no,
0245 work_group_size, compare, find_minimum, false, queue
0246 );
0247 }
0248
0249
0250 template<class InputIterator, class Compare>
0251 InputIterator find_extrema_with_reduce(InputIterator first,
0252 InputIterator last,
0253 Compare compare,
0254 const bool find_minimum,
0255 command_queue &queue)
0256 {
0257 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0258 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0259
0260 const context &context = queue.get_context();
0261 const device &device = queue.get_device();
0262
0263
0264 const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
0265 const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
0266
0267 const size_t count = detail::iterator_range_size(first, last);
0268
0269 std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
0270 + type_name<input_type>();
0271
0272
0273 boost::shared_ptr<parameter_cache> parameters =
0274 detail::parameter_cache::get_global_cache(device);
0275
0276
0277
0278 size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
0279 size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
0280
0281
0282 work_group_size = (std::min)(max_work_group_size, work_group_size);
0283 size_t work_groups_no = compute_units_no * work_groups_per_cu;
0284 work_groups_no = (std::min)(
0285 work_groups_no,
0286 static_cast<size_t>(std::ceil(float(count) / work_group_size))
0287 );
0288
0289
0290
0291
0292
0293 vector<input_type> candidates(work_groups_no, context);
0294 vector<uint_> candidates_idx(work_groups_no, context);
0295
0296
0297 find_extrema_with_reduce(
0298 first, count, candidates.begin(), candidates_idx.begin(),
0299 work_groups_no, work_group_size, compare, find_minimum, queue
0300 );
0301
0302
0303
0304
0305 vector<input_type, ::boost::compute::pinned_allocator<input_type> >
0306 result(1, context);
0307 vector<uint_, ::boost::compute::pinned_allocator<uint_> >
0308 result_idx(1, context);
0309
0310
0311 find_extrema_with_reduce(
0312 candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
0313 result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
0314 );
0315
0316
0317 uint_* result_idx_host_ptr =
0318 static_cast<uint_*>(
0319 queue.enqueue_map_buffer(
0320 result_idx.get_buffer(), command_queue::map_read,
0321 0, sizeof(uint_)
0322 )
0323 );
0324
0325 return first + static_cast<difference_type>(*result_idx_host_ptr);
0326 }
0327
0328 template<class InputIterator>
0329 InputIterator find_extrema_with_reduce(InputIterator first,
0330 InputIterator last,
0331 ::boost::compute::less<
0332 typename std::iterator_traits<
0333 InputIterator
0334 >::value_type
0335 >
0336 compare,
0337 const bool find_minimum,
0338 command_queue &queue)
0339 {
0340 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0341 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
0342
0343 const context &context = queue.get_context();
0344 const device &device = queue.get_device();
0345
0346
0347 const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
0348 const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
0349
0350 const size_t count = detail::iterator_range_size(first, last);
0351
0352 std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
0353 + type_name<input_type>();
0354
0355
0356 boost::shared_ptr<parameter_cache> parameters =
0357 detail::parameter_cache::get_global_cache(device);
0358
0359
0360
0361 size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
0362 size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
0363
0364
0365 work_group_size = (std::min)(max_work_group_size, work_group_size);
0366 size_t work_groups_no = compute_units_no * work_groups_per_cu;
0367 work_groups_no = (std::min)(
0368 work_groups_no,
0369 static_cast<size_t>(std::ceil(float(count) / work_group_size))
0370 );
0371
0372
0373
0374
0375
0376
0377 vector<input_type, ::boost::compute::pinned_allocator<input_type> >
0378 candidates(work_groups_no, context);
0379 vector<uint_, ::boost::compute::pinned_allocator <uint_> >
0380 candidates_idx(work_groups_no, context);
0381
0382
0383 find_extrema_with_reduce(
0384 first, count, candidates.begin(), candidates_idx.begin(),
0385 work_groups_no, work_group_size, compare, find_minimum, queue
0386 );
0387
0388
0389
0390
0391 input_type* candidates_host_ptr =
0392 static_cast<input_type*>(
0393 queue.enqueue_map_buffer(
0394 candidates.get_buffer(), command_queue::map_read,
0395 0, work_groups_no * sizeof(input_type)
0396 )
0397 );
0398
0399 uint_* candidates_idx_host_ptr =
0400 static_cast<uint_*>(
0401 queue.enqueue_map_buffer(
0402 candidates_idx.get_buffer(), command_queue::map_read,
0403 0, work_groups_no * sizeof(uint_)
0404 )
0405 );
0406
0407 input_type* i = candidates_host_ptr;
0408 uint_* idx = candidates_idx_host_ptr;
0409 uint_* extremum_idx = idx;
0410 input_type extremum = *candidates_host_ptr;
0411 i++; idx++;
0412
0413
0414 if(!find_minimum) {
0415 while(idx != (candidates_idx_host_ptr + work_groups_no)) {
0416 input_type next = *i;
0417 bool compare_result = next > extremum;
0418 bool equal = next == extremum;
0419 extremum = compare_result ? next : extremum;
0420 extremum_idx = compare_result ? idx : extremum_idx;
0421 extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
0422 idx++, i++;
0423 }
0424 }
0425 else {
0426 while(idx != (candidates_idx_host_ptr + work_groups_no)) {
0427 input_type next = *i;
0428 bool compare_result = next < extremum;
0429 bool equal = next == extremum;
0430 extremum = compare_result ? next : extremum;
0431 extremum_idx = compare_result ? idx : extremum_idx;
0432 extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
0433 idx++, i++;
0434 }
0435 }
0436
0437 return first + static_cast<difference_type>(*extremum_idx);
0438 }
0439
0440 }
0441 }
0442 }
0443
0444 #endif