File indexing completed on 2025-01-18 09:29:54
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
0013
0014 #include <iterator>
0015
0016 #include <boost/compute/types.hpp>
0017 #include <boost/compute/functional.hpp>
0018 #include <boost/compute/command_queue.hpp>
0019 #include <boost/compute/container/detail/scalar.hpp>
0020 #include <boost/compute/iterator/buffer_iterator.hpp>
0021 #include <boost/compute/type_traits/type_name.hpp>
0022 #include <boost/compute/detail/meta_kernel.hpp>
0023 #include <boost/compute/detail/iterator_range_size.hpp>
0024 #include <boost/compute/detail/parameter_cache.hpp>
0025
0026 namespace boost {
0027 namespace compute {
0028 namespace detail {
0029
0030 template<class InputIterator, class UnaryPredicate>
0031 inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
0032 InputIterator last,
0033 UnaryPredicate predicate,
0034 const size_t count,
0035 command_queue &queue)
0036 {
0037 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
0038 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0039
0040 const context &context = queue.get_context();
0041
0042 detail::meta_kernel k("find_if");
0043 size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
0044 atomic_min<uint_> atomic_min_uint;
0045
0046 k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
0047 << k.decl<const value_type>("value") << "="
0048 << first[k.var<const uint_>("i")] << ";\n"
0049 << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
0050 << " " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
0051 << "}\n";
0052
0053 kernel kernel = k.compile(context);
0054
0055 scalar<uint_> index(context);
0056 kernel.set_arg(index_arg, index.get_buffer());
0057
0058
0059 index.write(static_cast<uint_>(count), queue);
0060 queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
0061
0062
0063 return first + static_cast<difference_type>(index.read(queue));
0064 }
0065
0066 template<class InputIterator, class UnaryPredicate>
0067 inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
0068 InputIterator last,
0069 UnaryPredicate predicate,
0070 const size_t count,
0071 const size_t vpt,
0072 command_queue &queue)
0073 {
0074 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
0075 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0076
0077 const context &context = queue.get_context();
0078 const device &device = queue.get_device();
0079
0080 detail::meta_kernel k("find_if");
0081 size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
0082 size_t count_arg = k.add_arg<const uint_>("count");
0083 size_t vpt_arg = k.add_arg<const uint_>("vpt");
0084 atomic_min<uint_> atomic_min_uint;
0085
0086
0087 if(device.type() & device::gpu) {
0088 k <<
0089 k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
0090 k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
0091 k.decl<const uint_>("end") << " = min(" <<
0092 "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
0093 "count" <<
0094 ");\n" <<
0095
0096
0097 "__local uint local_index;\n" <<
0098 "if(get_local_id(0) == 0){\n" <<
0099 " local_index = *index;\n " <<
0100 "};\n" <<
0101 "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
0102 "if(local_index < id){\n" <<
0103 " return;\n" <<
0104 "}\n" <<
0105
0106 "while(id < end){\n" <<
0107 " " << k.decl<const value_type>("value") << " = " <<
0108 first[k.var<const uint_>("id")] << ";\n"
0109 " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
0110 " " << atomic_min_uint(k.var<uint_ *>("index"),
0111 k.var<uint_>("id")) << ";\n" <<
0112 " return;\n"
0113 " }\n" <<
0114 " id+=lsize;\n" <<
0115 "}\n";
0116
0117
0118 } else {
0119 k <<
0120 k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
0121 k.decl<const uint_>("end") << " = min(" <<
0122 "id + " << k.var<uint_>("vpt") << "," <<
0123 "count" <<
0124 ");\n" <<
0125 "while(id < end && (*index) > id){\n" <<
0126 " " << k.decl<const value_type>("value") << " = " <<
0127 first[k.var<const uint_>("id")] << ";\n"
0128 " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
0129 " " << atomic_min_uint(k.var<uint_ *>("index"),
0130 k.var<uint_>("id")) << ";\n" <<
0131 " return;\n" <<
0132 " }\n" <<
0133 " id++;\n" <<
0134 "}\n";
0135 }
0136
0137 kernel kernel = k.compile(context);
0138
0139 scalar<uint_> index(context);
0140 kernel.set_arg(index_arg, index.get_buffer());
0141 kernel.set_arg(count_arg, static_cast<uint_>(count));
0142 kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));
0143
0144
0145 index.write(static_cast<uint_>(count), queue);
0146
0147 const size_t global_wg_size = static_cast<size_t>(
0148 std::ceil(float(count) / vpt)
0149 );
0150 queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);
0151
0152
0153 return first + static_cast<difference_type>(index.read(queue));
0154 }
0155
0156
0157 template<class InputIterator, class UnaryPredicate>
0158 inline InputIterator find_if_with_atomics(InputIterator first,
0159 InputIterator last,
0160 UnaryPredicate predicate,
0161 command_queue &queue)
0162 {
0163 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
0164
0165 size_t count = detail::iterator_range_size(first, last);
0166 if(count == 0){
0167 return last;
0168 }
0169
0170 const device &device = queue.get_device();
0171
0172
0173 std::string cache_key = std::string("__boost_find_if_with_atomics_")
0174 + type_name<value_type>();
0175 boost::shared_ptr<parameter_cache> parameters =
0176 detail::parameter_cache::get_global_cache(device);
0177
0178
0179
0180 if(device.type() & device::gpu){
0181 const size_t one_vpt_threshold =
0182 parameters->get(cache_key, "one_vpt_threshold", 1048576);
0183 if(count <= one_vpt_threshold){
0184 return find_if_with_atomics_one_vpt(
0185 first, last, predicate, count, queue
0186 );
0187 }
0188 }
0189
0190
0191 size_t vpt;
0192 if(device.type() & device::gpu){
0193
0194 vpt = parameters->get(cache_key, "vpt", 32);
0195 } else {
0196
0197 const size_t max_compute_units =
0198 device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
0199 vpt = static_cast<size_t>(
0200 std::ceil(float(count) / max_compute_units)
0201 );
0202 }
0203
0204 return find_if_with_atomics_multiple_vpt(
0205 first, last, predicate, count, vpt, queue
0206 );
0207 }
0208
0209 }
0210 }
0211 }
0212
0213 #endif