//------------------------------- -*- C++ -*- -------------------------------//
// Copyright Celeritas contributors: see top-level COPYRIGHT file for details
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
//---------------------------------------------------------------------------//
//! \file corecel/data/StackAllocator.hh
//---------------------------------------------------------------------------//
#pragma once

#include <new>

#include "corecel/math/Atomics.hh"

#include "StackAllocatorData.hh"

namespace celeritas
{
//---------------------------------------------------------------------------//
/*!
 * Dynamically allocate arbitrary data on a stack.
 *
 * The stack allocator view acts as a functor and accessor to the allocated
 * data. It enables very fast on-device dynamic allocation of data, such as
 * secondaries or detector hits. As an example, inside a hypothetical physics
 * Interactor class, you could create two particles with the following code:
 * \code

 struct Interactor
 {
     StackAllocator<Secondary> allocate;

     // Sample an interaction
     template<class Engine>
     Interaction operator()(Engine&)
     {
         // Create 2 secondary particles
         Secondary* allocated = this->allocate(2);
         if (!allocated)
         {
             return Interaction::from_failure();
         }
         Interaction result;
         result.secondaries = Span<Secondary>{allocated, 2};
         return result;
     }
 };
 \endcode
 *
 * A later kernel could then iterate over the secondaries to apply cutoffs:
 * \code
 using SecondaryRef
     = StackAllocatorData<Secondary, Ownership::reference, MemSpace::device>;

 __global__ void apply_cutoff(const SecondaryRef ptrs)
 {
     auto thread_idx = celeritas::KernelParamCalculator::thread_id().get();
     StackAllocator<Secondary> allocate(ptrs);
     auto secondaries = allocate.get();
     if (thread_idx < secondaries.size())
     {
         Secondary& sec = secondaries[thread_idx];
         if (sec.energy < 100 * units::kilo_electron_volts)
         {
             sec.energy = 0;
         }
     }
 }
 * \endcode
 *
 * You *cannot* safely access the current size of the stack in the same kernel
 * that's modifying it -- if the stack attempts to allocate beyond the end,
 * then the \c size() call will reflect that overflowed state, rather than the
 * corrected size reflecting the failed allocation.
 *
 * A third kernel with a single thread would then be responsible for clearing
 * the data:
 * \code
 __global__ void clear_stack(const SecondaryRef ptrs)
 {
     StackAllocator<Secondary> allocate(ptrs);
     auto thread_idx = celeritas::KernelParamCalculator::thread_id().get();
     if (thread_idx == 0)
     {
         allocate.clear();
     }
 }
 * \endcode
 *
 * These separate kernel launches are needed as grid-level synchronization
 * points.
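 *
 * As a sketch of the resulting host-side launch order (the \c interact
 * kernel name and the launch parameters here are hypothetical, not part of
 * this interface):
 * \code
 interact<<<blocks, threads>>>(ptrs);      // allocates secondaries
 apply_cutoff<<<blocks, threads>>>(ptrs);  // reads size() and get()
 clear_stack<<<1, 1>>>(ptrs);              // resets the size to zero
 * \endcode
 * Each kernel launch boundary acts as a grid-wide barrier: all updates to the
 * shared size from one kernel are visible before the next kernel reads or
 * resets it.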
 *
 * \todo Instead of returning a pointer, return IdRange<T>. Rename
 * StackAllocatorData to StackAllocation and have it look like a collection so
 * that *it* will provide access to the data. Better yet, have a
 * StackAllocation that can be a `const_reference` to the StackAllocatorData.
 * Then the rule will be "you can't create a StackAllocator in the same kernel
 * that you directly access a StackAllocation".
 */
template<class T>
class StackAllocator
{
  public:
    //!@{
    //! \name Type aliases
    using value_type = T;
    using result_type = value_type*;
    using Data = StackAllocatorData<T, Ownership::reference, MemSpace::native>;
    //!@}

  public:
    // Construct with shared data
    explicit inline CELER_FUNCTION StackAllocator(Data const& data);

    // Total storage capacity (always safe)
    inline CELER_FUNCTION size_type capacity() const;

    //// INITIALIZE ////

    // Reset storage
    inline CELER_FUNCTION void clear();

    //// ALLOCATE ////

    // Allocate space for this many items
    inline CELER_FUNCTION result_type operator()(size_type count);

    //// ACCESS ////

    // Current size
    inline CELER_FUNCTION size_type size() const;

    // View all allocated data
    inline CELER_FUNCTION Span<value_type> get();
    inline CELER_FUNCTION Span<value_type const> get() const;

  private:
    Data const& data_;

    //// HELPER FUNCTIONS ////

    using SizeId = ItemId<size_type>;
    using StorageId = ItemId<T>;
    static CELER_CONSTEXPR_FUNCTION SizeId size_id() { return SizeId{0}; }
};

//---------------------------------------------------------------------------//
// INLINE DEFINITIONS
//---------------------------------------------------------------------------//
/*!
 * Construct with a reference to the shared allocator data.
 */
template<class T>
CELER_FUNCTION StackAllocator<T>::StackAllocator(Data const& shared)
    : data_(shared)
{
    CELER_EXPECT(shared);
}

//---------------------------------------------------------------------------//
/*!
 * Get the maximum number of values that can be allocated.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::capacity() const -> size_type
{
    return data_.storage.size();
}

//---------------------------------------------------------------------------//
/*!
 * Clear the stack allocator.
 *
 * This sets the size to zero. It should ideally *only* be called by a single
 * thread (though multiple threads resetting it should also be OK), but it
 * *cannot be used in the same kernel that is allocating or viewing it*. This
 * is because the access times between different threads or thread blocks are
 * indeterminate inside of a single kernel.
 */
template<class T>
CELER_FUNCTION void StackAllocator<T>::clear()
{
    data_.size[this->size_id()] = 0;
}

//---------------------------------------------------------------------------//
/*!
 * Allocate space for a given number of items.
 *
 * Returns \c nullptr if the allocation failed due to running out of memory.
 * Ensures that the shared size reflects the amount of data allocated.
 */
template<class T>
CELER_FUNCTION auto
StackAllocator<T>::operator()(size_type count) -> result_type
{
    CELER_EXPECT(count > 0);

    // Atomically add 'count' to the shared size
    size_type start = atomic_add(&data_.size[this->size_id()], count);
    if (CELER_UNLIKELY(start + count > data_.storage.size()))
    {
        // Out of memory: restore the old value so that another thread can
        // potentially use it. Multiple threads are likely to exceed the
        // capacity simultaneously. Only one has a "start" value less than or
        // equal to the total capacity: the remainder are (arbitrarily) higher
        // than that.
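        // For example (illustrative numbers only): with capacity 8 and
        // current size 6, threads A and B each request 4 items. A's atomic
        // add returns start = 6 (size becomes 10), and B's returns start =
        // 10 (size becomes 14). Both allocations fail, but only A satisfies
        // start <= capacity and restores the size to 6.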
        if (start <= this->capacity())
        {
            // We were the first thread to exceed capacity, even though other
            // threads might have failed (and might still be failing) to
            // allocate. Restore the actual allocated size to the start value.
            // This might allow another thread with a smaller allocation to
            // succeed, but it also guarantees that at the end of the kernel,
            // the size reflects the actual capacity.
            data_.size[this->size_id()] = start;
        }

        /*!
         * \todo It might be useful to set an "out of memory" flag to make it
         * easier for host code to detect whether a failure occurred, rather
         * than looping through primaries and testing for failure.
         */

        // Return null pointer, indicating failure to allocate
        return nullptr;
    }

    // Initialize the data at the newly "allocated" address
    value_type* result = new (&data_.storage[StorageId{start}]) value_type;
    for (size_type i = 1; i < count; ++i)
    {
        // Initialize remaining values
        CELER_ASSERT(&data_.storage[StorageId{start + i}] == result + i);
        new (&data_.storage[StorageId{start + i}]) value_type;
    }
    return result;
}

//---------------------------------------------------------------------------//
/*!
 * Get the number of items currently present.
 *
 * This value may not be meaningful (it may be less than the "actual" size) if
 * called in the same kernel as other threads that are allocating.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::size() const -> size_type
{
    size_type result = data_.size[this->size_id()];
    CELER_ENSURE(result <= this->capacity());
    return result;
}

//---------------------------------------------------------------------------//
/*!
 * View all allocated data.
 *
 * This cannot be called while any running kernel could be modifying the size.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::get() -> Span<value_type>
{
    return data_.storage[ItemRange<T>{StorageId{0}, StorageId{this->size()}}];
}

//---------------------------------------------------------------------------//
/*!
 * View all allocated data (const).
 *
 * This cannot be called while any running kernel could be modifying the size.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::get() const -> Span<value_type const>
{
    return data_.storage[ItemRange<T>{StorageId{0}, StorageId{this->size()}}];
}

//---------------------------------------------------------------------------//
}  // namespace celeritas