Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:29:54

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010 
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
0013 
0014 #include <iterator>
0015 
0016 #include <boost/compute/types.hpp>
0017 #include <boost/compute/functional.hpp>
0018 #include <boost/compute/command_queue.hpp>
0019 #include <boost/compute/container/detail/scalar.hpp>
0020 #include <boost/compute/iterator/buffer_iterator.hpp>
0021 #include <boost/compute/type_traits/type_name.hpp>
0022 #include <boost/compute/detail/meta_kernel.hpp>
0023 #include <boost/compute/detail/iterator_range_size.hpp>
0024 #include <boost/compute/detail/parameter_cache.hpp>
0025 
0026 namespace boost {
0027 namespace compute {
0028 namespace detail {
0029 
0030 template<class InputIterator, class UnaryPredicate>
0031 inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
0032                                                   InputIterator last,
0033                                                   UnaryPredicate predicate,
0034                                                   const size_t count,
0035                                                   command_queue &queue)
0036 {
0037     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
0038     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
0039 
0040     const context &context = queue.get_context();
0041 
0042     detail::meta_kernel k("find_if");
0043     size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
0044     atomic_min<uint_> atomic_min_uint;
0045 
0046     k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
0047       << k.decl<const value_type>("value") << "="
0048       <<     first[k.var<const uint_>("i")] << ";\n"
0049       << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
0050       << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
0051       << "}\n";
0052 
0053     kernel kernel = k.compile(context);
0054 
0055     scalar<uint_> index(context);
0056     kernel.set_arg(index_arg, index.get_buffer());
0057 
0058     // initialize index to the last iterator's index
0059     index.write(static_cast<uint_>(count), queue);
0060     queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
0061 
0062     // read index and return iterator
0063     return first + static_cast<difference_type>(index.read(queue));
0064 }
0065 
/// \internal_
/// Searches [first, last) with each kernel work-item examining \p vpt
/// consecutive values ("values per thread"). Matching work-items lower a
/// shared global index with atomic_min(), so the smallest matching
/// position wins; the initial value of the index is \p count, which maps
/// back to the last iterator when nothing matches.
///
/// Two kernel variants are generated depending on the device type; both
/// short-circuit once a match at a lower index is already recorded.
///
/// \param first     beginning of the range to search
/// \param last      end of the range (unused directly; size comes in as count)
/// \param predicate unary predicate applied to each value on the device
/// \param count     number of elements in [first, last)
/// \param vpt       values processed per work-item
/// \param queue     command queue to issue the kernel on
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;

    // for GPUs reads from global memory are coalesced
    if(device.type() & device::gpu) {
        // each work-group covers a contiguous chunk of lsize*vpt values;
        // within a chunk, a work-item strides by lsize so that adjacent
        // work-items touch adjacent addresses on each iteration
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                    "count" <<
            ");\n" <<

            // checking if the index is already found
            // (one work-item snapshots the global index into local memory;
            // the whole group exits early if a smaller match already exists)
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n " <<
            "};\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n"
            "    }\n" <<
            "    id+=lsize;\n" <<
            "}\n";
    // for CPUs (and other devices) reads are ordered so the big cache is
    // efficiently used.
    } else {
        // each work-item scans its own contiguous run of vpt values and
        // re-checks the global index each iteration to stop early
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + " << k.var<uint_>("vpt") << "," <<
                    "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id++;\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    // one work-item per vpt values, rounded up
    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
0155 
0156 // Space complexity: O(1)
0157 template<class InputIterator, class UnaryPredicate>
0158 inline InputIterator find_if_with_atomics(InputIterator first,
0159                                           InputIterator last,
0160                                           UnaryPredicate predicate,
0161                                           command_queue &queue)
0162 {
0163     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
0164 
0165     size_t count = detail::iterator_range_size(first, last);
0166     if(count == 0){
0167         return last;
0168     }
0169 
0170     const device &device = queue.get_device();
0171 
0172     // load cached parameters
0173     std::string cache_key = std::string("__boost_find_if_with_atomics_")
0174         + type_name<value_type>();
0175     boost::shared_ptr<parameter_cache> parameters =
0176         detail::parameter_cache::get_global_cache(device);
0177 
0178     // for relatively small inputs on GPUs kernel checking one value per thread
0179     // (work-item) is more efficient than its multiple values per thread version
0180     if(device.type() & device::gpu){
0181         const size_t one_vpt_threshold =
0182             parameters->get(cache_key, "one_vpt_threshold", 1048576);
0183         if(count <= one_vpt_threshold){
0184             return find_if_with_atomics_one_vpt(
0185                 first, last, predicate, count, queue
0186             );
0187         }
0188     }
0189 
0190     // values per thread
0191     size_t vpt;
0192     if(device.type() & device::gpu){
0193         // get vpt parameter
0194         vpt = parameters->get(cache_key, "vpt", 32);
0195     } else {
0196         // for CPUs work is split equally between compute units
0197         const size_t max_compute_units =
0198             device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
0199         vpt = static_cast<size_t>(
0200             std::ceil(float(count) / max_compute_units)
0201         );
0202     }
0203 
0204     return find_if_with_atomics_multiple_vpt(
0205         first, last, predicate, count, vpt, queue
0206     );
0207 }
0208 
0209 } // end detail namespace
0210 } // end compute namespace
0211 } // end boost namespace
0212 
0213 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP