Warning, file /include/oneapi/tbb/parallel_scan.h was not indexed
or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017 #ifndef __TBB_parallel_scan_H
0018 #define __TBB_parallel_scan_H
0019
0020 #include <functional>
0021
0022 #include "detail/_config.h"
0023 #include "detail/_namespace_injection.h"
0024 #include "detail/_exception.h"
0025 #include "detail/_task.h"
0026
0027 #include "profiling.h"
0028 #include "partitioner.h"
0029 #include "blocked_range.h"
0030 #include "task_group.h"
0031
0032 namespace tbb {
0033 namespace detail {
0034 namespace d1 {
0035
0036
0037
0038 struct pre_scan_tag {
0039 static bool is_final_scan() {return false;}
0040 operator bool() {return is_final_scan();}
0041 };
0042
0043
0044
0045 struct final_scan_tag {
0046 static bool is_final_scan() {return true;}
0047 operator bool() {return is_final_scan();}
0048 };
0049
0050 template<typename Range, typename Body>
0051 struct sum_node;
0052
0053 #if __TBB_CPP20_CONCEPTS_PRESENT
0054 }
0055 namespace d0 {
0056
0057 template <typename Body, typename Range>
0058 concept parallel_scan_body = splittable<Body> &&
0059 requires( Body& body, const Range& range, Body& other ) {
0060 body(range, tbb::detail::d1::pre_scan_tag{});
0061 body(range, tbb::detail::d1::final_scan_tag{});
0062 body.reverse_join(other);
0063 body.assign(other);
0064 };
0065
0066 template <typename Function, typename Range, typename Value>
0067 concept parallel_scan_function = std::invocable<const std::remove_reference_t<Function>&,
0068 const Range&, const Value&, bool> &&
0069 std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Function>&,
0070 const Range&, const Value&, bool>,
0071 Value>;
0072
0073 template <typename Combine, typename Value>
0074 concept parallel_scan_combine = std::invocable<const std::remove_reference_t<Combine>&,
0075 const Value&, const Value&> &&
0076 std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Combine>&,
0077 const Value&, const Value&>,
0078 Value>;
0079
0080 }
0081 namespace d1 {
0082 #endif
0083
0084
0085
0086 template<typename Range, typename Body>
0087 struct final_sum : public task {
0088 private:
0089 using sum_node_type = sum_node<Range, Body>;
0090 Body m_body;
0091 aligned_space<Range> m_range;
0092
0093 Body* m_stuff_last;
0094
0095 wait_context& m_wait_context;
0096 sum_node_type* m_parent = nullptr;
0097 public:
0098 small_object_allocator m_allocator;
0099 final_sum( Body& body, wait_context& w_o, small_object_allocator& alloc ) :
0100 m_body(body, split()), m_wait_context(w_o), m_allocator(alloc) {
0101 poison_pointer(m_stuff_last);
0102 }
0103
0104 final_sum( final_sum& sum, small_object_allocator& alloc ) :
0105 m_body(sum.m_body, split()), m_wait_context(sum.m_wait_context), m_allocator(alloc) {
0106 poison_pointer(m_stuff_last);
0107 }
0108
0109 ~final_sum() {
0110 m_range.begin()->~Range();
0111 }
0112 void finish_construction( sum_node_type* parent, const Range& range, Body* stuff_last ) {
0113 __TBB_ASSERT( m_parent == nullptr, nullptr );
0114 m_parent = parent;
0115 new( m_range.begin() ) Range(range);
0116 m_stuff_last = stuff_last;
0117 }
0118 private:
0119 sum_node_type* release_parent() {
0120 call_itt_task_notify(releasing, m_parent);
0121 if (m_parent) {
0122 auto parent = m_parent;
0123 m_parent = nullptr;
0124 if (parent->ref_count.fetch_sub(1) == 1) {
0125 return parent;
0126 }
0127 }
0128 else
0129 m_wait_context.release();
0130 return nullptr;
0131 }
0132 sum_node_type* finalize(const execution_data& ed){
0133 sum_node_type* next_task = release_parent();
0134 m_allocator.delete_object<final_sum>(this, ed);
0135 return next_task;
0136 }
0137
0138 public:
0139 task* execute(execution_data& ed) override {
0140 m_body( *m_range.begin(), final_scan_tag() );
0141 if( m_stuff_last )
0142 m_stuff_last->assign(m_body);
0143
0144 return finalize(ed);
0145 }
0146 task* cancel(execution_data& ed) override {
0147 return finalize(ed);
0148 }
0149 template<typename Tag>
0150 void operator()( const Range& r, Tag tag ) {
0151 m_body( r, tag );
0152 }
0153 void reverse_join( final_sum& a ) {
0154 m_body.reverse_join(a.m_body);
0155 }
0156 void reverse_join( Body& body ) {
0157 m_body.reverse_join(body);
0158 }
0159 void assign_to( Body& body ) {
0160 body.assign(m_body);
0161 }
0162 void self_destroy(const execution_data& ed) {
0163 m_allocator.delete_object<final_sum>(this, ed);
0164 }
0165 };
0166
0167
0168
0169 template<typename Range, typename Body>
0170 struct sum_node : public task {
0171 private:
0172 using final_sum_type = final_sum<Range,Body>;
0173 public:
0174 final_sum_type *m_incoming;
0175 final_sum_type *m_body;
0176 Body *m_stuff_last;
0177 private:
0178 final_sum_type *m_left_sum;
0179 sum_node *m_left;
0180 sum_node *m_right;
0181 bool m_left_is_final;
0182 Range m_range;
0183 wait_context& m_wait_context;
0184 sum_node* m_parent;
0185 small_object_allocator m_allocator;
0186 public:
0187 std::atomic<unsigned int> ref_count{0};
0188 sum_node( const Range range, bool left_is_final_, sum_node* parent, wait_context& w_o, small_object_allocator& alloc ) :
0189 m_stuff_last(nullptr),
0190 m_left_sum(nullptr),
0191 m_left(nullptr),
0192 m_right(nullptr),
0193 m_left_is_final(left_is_final_),
0194 m_range(range),
0195 m_wait_context(w_o),
0196 m_parent(parent),
0197 m_allocator(alloc)
0198 {
0199 if( m_parent )
0200 m_parent->ref_count.fetch_add(1);
0201
0202 poison_pointer(m_body);
0203 poison_pointer(m_incoming);
0204 }
0205
0206 ~sum_node() {
0207 if (m_parent)
0208 m_parent->ref_count.fetch_sub(1);
0209 }
0210 private:
0211 sum_node* release_parent() {
0212 call_itt_task_notify(releasing, m_parent);
0213 if (m_parent) {
0214 auto parent = m_parent;
0215 m_parent = nullptr;
0216 if (parent->ref_count.fetch_sub(1) == 1) {
0217 return parent;
0218 }
0219 }
0220 else
0221 m_wait_context.release();
0222 return nullptr;
0223 }
0224 task* create_child( const Range& range, final_sum_type& body, sum_node* child, final_sum_type* incoming, Body* stuff_last ) {
0225 if( child ) {
0226 __TBB_ASSERT( is_poisoned(child->m_body) && is_poisoned(child->m_incoming), nullptr );
0227 child->prepare_for_execution(body, incoming, stuff_last);
0228 return child;
0229 } else {
0230 body.finish_construction(this, range, stuff_last);
0231 return &body;
0232 }
0233 }
0234
0235 sum_node* finalize(const execution_data& ed) {
0236 sum_node* next_task = release_parent();
0237 m_allocator.delete_object<sum_node>(this, ed);
0238 return next_task;
0239 }
0240
0241 public:
0242 void prepare_for_execution(final_sum_type& body, final_sum_type* incoming, Body *stuff_last) {
0243 this->m_body = &body;
0244 this->m_incoming = incoming;
0245 this->m_stuff_last = stuff_last;
0246 }
0247 task* execute(execution_data& ed) override {
0248 if( m_body ) {
0249 if( m_incoming )
0250 m_left_sum->reverse_join( *m_incoming );
0251 task* right_child = this->create_child(Range(m_range,split()), *m_left_sum, m_right, m_left_sum, m_stuff_last);
0252 task* left_child = m_left_is_final ? nullptr : this->create_child(m_range, *m_body, m_left, m_incoming, nullptr);
0253 ref_count = (left_child != nullptr) + (right_child != nullptr);
0254 m_body = nullptr;
0255 if( left_child ) {
0256 spawn(*right_child, *ed.context);
0257 return left_child;
0258 } else {
0259 return right_child;
0260 }
0261 } else {
0262 return finalize(ed);
0263 }
0264 }
0265 task* cancel(execution_data& ed) override {
0266 return finalize(ed);
0267 }
0268 void self_destroy(const execution_data& ed) {
0269 m_allocator.delete_object<sum_node>(this, ed);
0270 }
0271 template<typename range,typename body,typename partitioner>
0272 friend struct start_scan;
0273
0274 template<typename range,typename body>
0275 friend struct finish_scan;
0276 };
0277
0278
0279
0280 template<typename Range, typename Body>
0281 struct finish_scan : public task {
0282 private:
0283 using sum_node_type = sum_node<Range,Body>;
0284 using final_sum_type = final_sum<Range,Body>;
0285 final_sum_type** const m_sum_slot;
0286 sum_node_type*& m_return_slot;
0287 small_object_allocator m_allocator;
0288 public:
0289 std::atomic<final_sum_type*> m_right_zombie;
0290 sum_node_type& m_result;
0291 std::atomic<unsigned int> ref_count{2};
0292 finish_scan* m_parent;
0293 wait_context& m_wait_context;
0294 task* execute(execution_data& ed) override {
0295 __TBB_ASSERT( m_result.ref_count.load() == static_cast<unsigned int>((m_result.m_left!=nullptr)+(m_result.m_right!=nullptr)), nullptr );
0296 if( m_result.m_left )
0297 m_result.m_left_is_final = false;
0298 final_sum_type* right_zombie = m_right_zombie.load(std::memory_order_acquire);
0299 if( right_zombie && m_sum_slot )
0300 (*m_sum_slot)->reverse_join(*m_result.m_left_sum);
0301 __TBB_ASSERT( !m_return_slot, nullptr );
0302 if( right_zombie || m_result.m_right ) {
0303 m_return_slot = &m_result;
0304 } else {
0305 m_result.self_destroy(ed);
0306 }
0307 if( right_zombie && !m_sum_slot && !m_result.m_right ) {
0308 right_zombie->self_destroy(ed);
0309 m_right_zombie.store(nullptr, std::memory_order_relaxed);
0310 }
0311 return finalize(ed);
0312 }
0313 task* cancel(execution_data& ed) override {
0314 return finalize(ed);
0315 }
0316 finish_scan(sum_node_type*& return_slot, final_sum_type** sum, sum_node_type& result_, finish_scan* parent, wait_context& w_o, small_object_allocator& alloc) :
0317 m_sum_slot(sum),
0318 m_return_slot(return_slot),
0319 m_allocator(alloc),
0320 m_right_zombie(nullptr),
0321 m_result(result_),
0322 m_parent(parent),
0323 m_wait_context(w_o)
0324 {
0325 __TBB_ASSERT( !m_return_slot, nullptr );
0326 }
0327 private:
0328 finish_scan* release_parent() {
0329 call_itt_task_notify(releasing, m_parent);
0330 if (m_parent) {
0331 auto parent = m_parent;
0332 m_parent = nullptr;
0333 if (parent->ref_count.fetch_sub(1) == 1) {
0334 return parent;
0335 }
0336 }
0337 else
0338 m_wait_context.release();
0339 return nullptr;
0340 }
0341 finish_scan* finalize(const execution_data& ed) {
0342 finish_scan* next_task = release_parent();
0343 m_allocator.delete_object<finish_scan>(this, ed);
0344 return next_task;
0345 }
0346 };
0347
0348
0349
0350 template<typename Range, typename Body, typename Partitioner>
0351 struct start_scan : public task {
0352 private:
0353 using sum_node_type = sum_node<Range,Body>;
0354 using final_sum_type = final_sum<Range,Body>;
0355 using finish_pass1_type = finish_scan<Range,Body>;
0356 std::reference_wrapper<sum_node_type*> m_return_slot;
0357 Range m_range;
0358 std::reference_wrapper<final_sum_type> m_body;
0359 typename Partitioner::partition_type m_partition;
0360
0361 final_sum_type** m_sum_slot;
0362 bool m_is_final;
0363 bool m_is_right_child;
0364
0365 finish_pass1_type* m_parent;
0366 small_object_allocator m_allocator;
0367 wait_context& m_wait_context;
0368
0369 finish_pass1_type* release_parent() {
0370 call_itt_task_notify(releasing, m_parent);
0371 if (m_parent) {
0372 auto parent = m_parent;
0373 m_parent = nullptr;
0374 if (parent->ref_count.fetch_sub(1) == 1) {
0375 return parent;
0376 }
0377 }
0378 else
0379 m_wait_context.release();
0380 return nullptr;
0381 }
0382
0383 finish_pass1_type* finalize( const execution_data& ed ) {
0384 finish_pass1_type* next_task = release_parent();
0385 m_allocator.delete_object<start_scan>(this, ed);
0386 return next_task;
0387 }
0388
0389 public:
0390 task* execute( execution_data& ) override;
0391 task* cancel( execution_data& ed ) override {
0392 return finalize(ed);
0393 }
0394 start_scan( sum_node_type*& return_slot, start_scan& parent, small_object_allocator& alloc ) :
0395 m_return_slot(return_slot),
0396 m_range(parent.m_range,split()),
0397 m_body(parent.m_body),
0398 m_partition(parent.m_partition,split()),
0399 m_sum_slot(parent.m_sum_slot),
0400 m_is_final(parent.m_is_final),
0401 m_is_right_child(true),
0402 m_parent(parent.m_parent),
0403 m_allocator(alloc),
0404 m_wait_context(parent.m_wait_context)
0405 {
0406 __TBB_ASSERT( !m_return_slot, nullptr );
0407 parent.m_is_right_child = false;
0408 }
0409
0410 start_scan( sum_node_type*& return_slot, const Range& range, final_sum_type& body, const Partitioner& partitioner, wait_context& w_o, small_object_allocator& alloc ) :
0411 m_return_slot(return_slot),
0412 m_range(range),
0413 m_body(body),
0414 m_partition(partitioner),
0415 m_sum_slot(nullptr),
0416 m_is_final(true),
0417 m_is_right_child(false),
0418 m_parent(nullptr),
0419 m_allocator(alloc),
0420 m_wait_context(w_o)
0421 {
0422 __TBB_ASSERT( !m_return_slot, nullptr );
0423 }
0424
0425 static void run( const Range& range, Body& body, const Partitioner& partitioner ) {
0426 if( !range.empty() ) {
0427 task_group_context context(PARALLEL_SCAN);
0428
0429 using start_pass1_type = start_scan<Range,Body,Partitioner>;
0430 sum_node_type* root = nullptr;
0431 wait_context w_ctx{1};
0432 small_object_allocator alloc{};
0433
0434 auto& temp_body = *alloc.new_object<final_sum_type>(body, w_ctx, alloc);
0435 temp_body.reverse_join(body);
0436
0437 auto& pass1 = *alloc.new_object<start_pass1_type>(root, range, temp_body, partitioner, w_ctx, alloc);
0438
0439 execute_and_wait(pass1, context, w_ctx, context);
0440 if( root ) {
0441 root->prepare_for_execution(temp_body, nullptr, &body);
0442 w_ctx.reserve();
0443 execute_and_wait(*root, context, w_ctx, context);
0444 } else {
0445 temp_body.assign_to(body);
0446 temp_body.finish_construction(nullptr, range, nullptr);
0447 alloc.delete_object<final_sum_type>(&temp_body);
0448 }
0449 }
0450 }
0451 };
0452
0453 template<typename Range, typename Body, typename Partitioner>
0454 task* start_scan<Range,Body,Partitioner>::execute( execution_data& ed ) {
0455
0456
0457
0458 __TBB_ASSERT(!m_is_right_child || m_parent, "right child is never an orphan");
0459 bool treat_as_stolen = m_is_right_child && (is_stolen(ed) || &m_body.get()!=m_parent->m_result.m_left_sum);
0460 if( treat_as_stolen ) {
0461
0462 small_object_allocator alloc{};
0463 final_sum_type* right_zombie = alloc.new_object<final_sum_type>(m_body, alloc);
0464 m_parent->m_right_zombie.store(right_zombie, std::memory_order_release);
0465 m_body = *right_zombie;
0466 m_is_final = false;
0467 }
0468 task* next_task = nullptr;
0469 if( (m_is_right_child && !treat_as_stolen) || !m_range.is_divisible() || m_partition.should_execute_range(ed) ) {
0470 if( m_is_final )
0471 m_body(m_range, final_scan_tag());
0472 else if( m_sum_slot )
0473 m_body(m_range, pre_scan_tag());
0474 if( m_sum_slot )
0475 *m_sum_slot = &m_body.get();
0476 __TBB_ASSERT( !m_return_slot, nullptr );
0477
0478 next_task = finalize(ed);
0479 } else {
0480 small_object_allocator alloc{};
0481 auto result = alloc.new_object<sum_node_type>(m_range,m_is_final, m_parent? &m_parent->m_result: nullptr, m_wait_context, alloc);
0482
0483 auto new_parent = alloc.new_object<finish_pass1_type>(m_return_slot, m_sum_slot, *result, m_parent, m_wait_context, alloc);
0484 m_parent = new_parent;
0485
0486
0487 auto& right_child = *alloc.new_object<start_scan>(result->m_right, *this, alloc);
0488
0489 spawn(right_child, *ed.context);
0490
0491 m_sum_slot = &result->m_left_sum;
0492 m_return_slot = result->m_left;
0493
0494 __TBB_ASSERT( !m_return_slot, nullptr );
0495 next_task = this;
0496 }
0497 return next_task;
0498 }
0499
0500 template<typename Range, typename Value, typename Scan, typename ReverseJoin>
0501 class lambda_scan_body {
0502 Value m_sum_slot;
0503 const Value& identity_element;
0504 const Scan& m_scan;
0505 const ReverseJoin& m_reverse_join;
0506 public:
0507 void operator=(const lambda_scan_body&) = delete;
0508 lambda_scan_body(const lambda_scan_body&) = default;
0509
0510 lambda_scan_body( const Value& identity, const Scan& scan, const ReverseJoin& rev_join )
0511 : m_sum_slot(identity)
0512 , identity_element(identity)
0513 , m_scan(scan)
0514 , m_reverse_join(rev_join) {}
0515
0516 lambda_scan_body( lambda_scan_body& b, split )
0517 : m_sum_slot(b.identity_element)
0518 , identity_element(b.identity_element)
0519 , m_scan(b.m_scan)
0520 , m_reverse_join(b.m_reverse_join) {}
0521
0522 template<typename Tag>
0523 void operator()( const Range& r, Tag tag ) {
0524 m_sum_slot = tbb::detail::invoke(m_scan, r, m_sum_slot, tag);
0525 }
0526
0527 void reverse_join( lambda_scan_body& a ) {
0528 m_sum_slot = tbb::detail::invoke(m_reverse_join, a.m_sum_slot, m_sum_slot);
0529 }
0530
0531 void assign( lambda_scan_body& b ) {
0532 m_sum_slot = b.m_sum_slot;
0533 }
0534
0535 Value result() const {
0536 return m_sum_slot;
0537 }
0538 };
0539
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
0556
0557
0558
0559
0560
0561
0562 template<typename Range, typename Body>
0563 __TBB_requires(tbb_range<Range> && parallel_scan_body<Body, Range>)
0564 void parallel_scan( const Range& range, Body& body ) {
0565 start_scan<Range, Body, auto_partitioner>::run(range,body,__TBB_DEFAULT_PARTITIONER());
0566 }
0567
0568
0569
0570 template<typename Range, typename Body>
0571 __TBB_requires(tbb_range<Range> && parallel_scan_body<Body, Range>)
0572 void parallel_scan( const Range& range, Body& body, const simple_partitioner& partitioner ) {
0573 start_scan<Range, Body, simple_partitioner>::run(range, body, partitioner);
0574 }
0575
0576
0577
0578 template<typename Range, typename Body>
0579 __TBB_requires(tbb_range<Range> && parallel_scan_body<Body, Range>)
0580 void parallel_scan( const Range& range, Body& body, const auto_partitioner& partitioner ) {
0581 start_scan<Range,Body,auto_partitioner>::run(range, body, partitioner);
0582 }
0583
0584
0585
0586 template<typename Range, typename Value, typename Scan, typename ReverseJoin>
0587 __TBB_requires(tbb_range<Range> && parallel_scan_function<Scan, Range, Value> &&
0588 parallel_scan_combine<ReverseJoin, Value>)
0589 Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join ) {
0590 lambda_scan_body<Range, Value, Scan, ReverseJoin> body(identity, scan, reverse_join);
0591 parallel_scan(range, body, __TBB_DEFAULT_PARTITIONER());
0592 return body.result();
0593 }
0594
0595
0596
0597 template<typename Range, typename Value, typename Scan, typename ReverseJoin>
0598 __TBB_requires(tbb_range<Range> && parallel_scan_function<Scan, Range, Value> &&
0599 parallel_scan_combine<ReverseJoin, Value>)
0600 Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join,
0601 const simple_partitioner& partitioner ) {
0602 lambda_scan_body<Range, Value, Scan, ReverseJoin> body(identity, scan, reverse_join);
0603 parallel_scan(range, body, partitioner);
0604 return body.result();
0605 }
0606
0607
0608
0609 template<typename Range, typename Value, typename Scan, typename ReverseJoin>
0610 __TBB_requires(tbb_range<Range> && parallel_scan_function<Scan, Range, Value> &&
0611 parallel_scan_combine<ReverseJoin, Value>)
0612 Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join,
0613 const auto_partitioner& partitioner ) {
0614 lambda_scan_body<Range, Value, Scan, ReverseJoin> body(identity, scan, reverse_join);
0615 parallel_scan(range, body, partitioner);
0616 return body.result();
0617 }
0618
0619 }
0620 }
0621
0622 inline namespace v1 {
0623 using detail::d1::parallel_scan;
0624 using detail::d1::pre_scan_tag;
0625 using detail::d1::final_scan_tag;
0626 }
0627
0628 }
0629
0630 #endif