algorithm/detail/count_if_with_threads.hpp

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
0013
0014 #include <numeric>
0015
0016 #include <boost/compute/detail/meta_kernel.hpp>
0017 #include <boost/compute/container/vector.hpp>
0018
0019 namespace boost {
0020 namespace compute {
0021 namespace detail {
0022
0023 template<class InputIterator, class Predicate>
0024 class count_if_with_threads_kernel : meta_kernel
0025 {
0026 public:
0027     typedef typename
0028         std::iterator_traits<InputIterator>::value_type
0029         value_type;
0030
0031     count_if_with_threads_kernel()
0032         : meta_kernel("count_if_with_threads")
0033     {
0034     }
0035
0036     void set_args(InputIterator first,
0037                   InputIterator last,
0038                   Predicate predicate)
0039
0040     {
0041         typedef typename std::iterator_traits<InputIterator>::value_type T;
0042
0043         m_size = detail::iterator_range_size(first, last);
0044
0045         m_size_arg = add_arg<const ulong_>("size");
0046         m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts");
0047
0048         *this <<
0049             // thread parameters
0050             "const uint gid = get_global_id(0);\n" <<
0051             "const uint block_size = size / get_global_size(0);\n" <<
0052             "const uint start = block_size * gid;\n" <<
0053             "uint end = 0;\n" <<
0054             "if(gid == get_global_size(0) - 1)\n" <<
0055             "    end = size;\n" <<
0056             "else\n" <<
0057             "    end = block_size * gid + block_size;\n" <<
0058
0059             // count values
0060             "uint count = 0;\n" <<
0061             "for(uint i = start; i < end; i++){\n" <<
0062                 decl<const T>("value") << "="
0063                     << first[expr<uint_>("i")] << ";\n" <<
0064                 if_(predicate(var<const T>("value"))) << "{\n" <<
0065                     "count++;\n" <<
0066                 "}\n" <<
0067             "}\n" <<
0068
0069             // write count
0070             "counts[gid] = count;\n";
0071     }
0072
0073     size_t exec(command_queue &queue)
0074     {
0075         const device &device = queue.get_device();
0076         const context &context = queue.get_context();
0077
0078         size_t threads = device.compute_units();
0079
0080         const size_t minimum_block_size = 2048;
0081         if(m_size / threads < minimum_block_size){
0082             threads = static_cast<size_t>(
0083                           (std::max)(
0084                               std::ceil(float(m_size) / minimum_block_size),
0085                               1.0f
0086                           )
0087                       );
0088         }
0089
0090         // storage for counts
0091         ::boost::compute::vector<ulong_> counts(threads, context);
0092
0093         // exec kernel
0094         set_arg(m_size_arg, static_cast<ulong_>(m_size));
0095         set_arg(m_counts_arg, counts.get_buffer());
0096         exec_1d(queue, 0, threads, 1);
0097
0098         // copy counts to the host
0099         std::vector<ulong_> host_counts(threads);
0100         ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue);
0101
0102         // return sum of counts
0103         return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0));
0104     }
0105
0106 private:
0107     size_t m_size;
0108     size_t m_size_arg;
0109     size_t m_counts_arg;
0110 };
0111
0112 // counts values that match the predicate using one thread per block. this is
0113 // optimized for cpu-type devices with a small number of compute units.
0114 template<class InputIterator, class Predicate>
0115 inline size_t count_if_with_threads(InputIterator first,
0116                                     InputIterator last,
0117                                     Predicate predicate,
0118                                     command_queue &queue)
0119 {
0120     count_if_with_threads_kernel<InputIterator, Predicate> kernel;
0121     kernel.set_args(first, last, predicate);
0122     return kernel.exec(queue);
0123 }
0124
0125 } // end detail namespace
0126 } // end compute namespace
0127 } // end boost namespace
0128
0129 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP