algorithm/detail/scan_on_cpu.hpp

0001 //---------------------------------------------------------------------------//
0002 // Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
0003 //
0004 // Distributed under the Boost Software License, Version 1.0
0005 // See accompanying file LICENSE_1_0.txt or copy at
0006 // http://www.boost.org/LICENSE_1_0.txt
0007 //
0008 // See http://boostorg.github.com/compute for more information.
0009 //---------------------------------------------------------------------------//
0010
0011 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
0012 #define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
0013
0014 #include <iterator>
0015
0016 #include <boost/compute/device.hpp>
0017 #include <boost/compute/kernel.hpp>
0018 #include <boost/compute/command_queue.hpp>
0019 #include <boost/compute/algorithm/detail/serial_scan.hpp>
0020 #include <boost/compute/detail/meta_kernel.hpp>
0021 #include <boost/compute/detail/iterator_range_size.hpp>
0022 #include <boost/compute/detail/parameter_cache.hpp>
0023
0024 namespace boost {
0025 namespace compute {
0026 namespace detail {
0027
0028 template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
0029 inline OutputIterator scan_on_cpu(InputIterator first,
0030                                   InputIterator last,
0031                                   OutputIterator result,
0032                                   bool exclusive,
0033                                   T init,
0034                                   BinaryOperator op,
0035                                   command_queue &queue)
0036 {
0037     typedef typename
0038         std::iterator_traits<InputIterator>::value_type input_type;
0039     typedef typename
0040         std::iterator_traits<OutputIterator>::value_type output_type;
0041
0042     const context &context = queue.get_context();
0043     const device &device = queue.get_device();
0044     const size_t compute_units = queue.get_device().compute_units();
0045
0046     boost::shared_ptr<parameter_cache> parameters =
0047         detail::parameter_cache::get_global_cache(device);
0048
0049     std::string cache_key =
0050         "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
0051
0052     // for inputs smaller than serial_scan_threshold
0053     // serial_scan algorithm is used
0054     uint_ serial_scan_threshold =
0055         parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
0056     serial_scan_threshold =
0057         (std::max)(serial_scan_threshold, uint_(compute_units));
0058
0059     size_t count = detail::iterator_range_size(first, last);
0060     if(count == 0){
0061         return result;
0062     }
0063     else if(count < serial_scan_threshold) {
0064         return serial_scan(first, last, result, exclusive, init, op, queue);
0065     }
0066
0067     buffer block_partial_sums(context, sizeof(output_type) * compute_units );
0068
0069     // create scan kernel
0070     meta_kernel k("scan_on_cpu_block_scan");
0071
0072     // Arguments
0073     size_t count_arg = k.add_arg<uint_>("count");
0074     size_t init_arg = k.add_arg<output_type>("initial_value");
0075     size_t block_partial_sums_arg =
0076         k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
0077
0078     k <<
0079         "uint block = (count + get_global_size(0))/(get_global_size(0) + 1);\n" <<
0080         "uint index = get_global_id(0) * block;\n" <<
0081         "uint end = min(count, index + block);\n" <<
0082         "if(index >= end) return;\n";
0083
0084     if(!exclusive){
0085         k <<
0086             k.decl<output_type>("sum") << " = " <<
0087                 first[k.var<uint_>("index")] << ";\n" <<
0088             result[k.var<uint_>("index")] << " = sum;\n" <<
0089             "index++;\n";
0090     }
0091     else {
0092         k <<
0093             k.decl<output_type>("sum") << ";\n" <<
0094             "if(index == 0){\n" <<
0095                 "sum = initial_value;\n" <<
0096             "}\n" <<
0097             "else {\n" <<
0098                 "sum = " << first[k.var<uint_>("index")] << ";\n" <<
0099                 "index++;\n" <<
0100             "}\n";
0101     }
0102
0103     k <<
0104         "while(index < end){\n" <<
0105             // load next value
0106             k.decl<const input_type>("value") << " = "
0107                 << first[k.var<uint_>("index")] << ";\n";
0108
0109     if(exclusive){
0110         k <<
0111             "if(get_global_id(0) == 0){\n" <<
0112                 result[k.var<uint_>("index")] << " = sum;\n" <<
0113             "}\n";
0114     }
0115     k <<
0116             "sum = " << op(k.var<output_type>("sum"),
0117                            k.var<output_type>("value")) << ";\n";
0118
0119     if(!exclusive){
0120         k <<
0121             "if(get_global_id(0) == 0){\n" <<
0122                 result[k.var<uint_>("index")] << " = sum;\n" <<
0123             "}\n";
0124     }
0125
0126     k <<
0127             "index++;\n" <<
0128         "}\n" << // end while
0129         "block_partial_sums[get_global_id(0)] = sum;\n";
0130
0131     // compile scan kernel
0132     kernel block_scan_kernel = k.compile(context);
0133
0134     // setup kernel arguments
0135     block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
0136     block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
0137     block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
0138
0139     // execute the kernel
0140     size_t global_work_size = compute_units;
0141     queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
0142
0143     // scan is done
0144     if(compute_units < 2) {
0145         return result + count;
0146     }
0147
0148     // final scan kernel
0149     meta_kernel l("scan_on_cpu_final_scan");
0150
0151     // Arguments
0152     count_arg = l.add_arg<uint_>("count");
0153     block_partial_sums_arg =
0154         l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
0155
0156     l <<
0157         "uint block = (count + get_global_size(0))/(get_global_size(0) + 1);\n" <<
0158         "uint index = block + get_global_id(0) * block;\n" <<
0159         "uint end = min(count, index + block);\n" <<
0160         k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
0161         "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
0162             "sum = " << op(k.var<output_type>("sum"),
0163                            k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
0164         "}\n" <<
0165
0166         "while(index < end){\n";
0167     if(exclusive){
0168         l <<
0169             l.decl<output_type>("value") << " = "
0170                 << first[k.var<uint_>("index")] << ";\n" <<
0171             result[k.var<uint_>("index")] << " = sum;\n" <<
0172             "sum = " << op(k.var<output_type>("sum"),
0173                            k.var<output_type>("value")) << ";\n";
0174     }
0175     else {
0176         l <<
0177             "sum = " << op(k.var<output_type>("sum"),
0178                            first[k.var<uint_>("index")]) << ";\n" <<
0179             result[k.var<uint_>("index")] << " = sum;\n";
0180     }
0181     l <<
0182             "index++;\n" <<
0183         "}\n";
0184
0185
0186     // compile scan kernel
0187     kernel final_scan_kernel = l.compile(context);
0188
0189     // setup kernel arguments
0190     final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
0191     final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
0192
0193     // execute the kernel
0194     global_work_size = compute_units;
0195     queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
0196
0197     // return iterator pointing to the end of the result range
0198     return result + count;
0199 }
0200
0201 } // end detail namespace
0202 } // end compute namespace
0203 } // end boost namespace
0204
0205 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP