//----------------------------------*-C++-*----------------------------------//
// Copyright 2020-2024 UT-Battelle, LLC, and other Celeritas developers.
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
//---------------------------------------------------------------------------//
//! \file corecel/data/StackAllocator.hh
//---------------------------------------------------------------------------//
#pragma once

#include <new>

#include "corecel/math/Atomics.hh"

#include "StackAllocatorData.hh"

namespace celeritas
{
//---------------------------------------------------------------------------//
/*!
 * Dynamically allocate arbitrary data on a stack.
 *
 * The stack allocator view acts as a functor and accessor to the allocated
 * data. It enables very fast on-device dynamic allocation of data, such as
 * secondaries or detector hits. As an example, inside a hypothetical physics
 * Interactor class, you could create two particles with the following code:
 * \code

   struct Interactor
   {
       StackAllocator<Secondary> allocate;

       // Sample an interaction
       template<class Engine>
       Interaction operator()(Engine&)
       {
           // Create 2 secondary particles
           Secondary* allocated = this->allocate(2);
           if (!allocated)
           {
               return Interaction::from_failure();
           }
           Interaction result;
           result.secondaries = Span<Secondary>{allocated, 2};
           return result;
       };
   };
   \endcode
 *
 * A later kernel could then iterate over the secondaries to apply cutoffs:
 * \code
   using SecondaryRef
       = StackAllocatorData<Secondary, Ownership::reference, MemSpace::device>;

   __global__ apply_cutoff(const SecondaryRef ptrs)
   {
       auto thread_idx = celeritas::KernelParamCalculator::thread_id().get();
       StackAllocator<Secondary> allocate(ptrs);
       auto secondaries = allocate.get();
       if (thread_idx < secondaries.size())
       {
           Secondary& sec = secondaries[thread_idx];
           if (sec.energy < 100 * units::kilo_electron_volts)
           {
               sec.energy = 0;
           }
       }
   }
 * \endcode
 *
 * You *cannot* safely access the current size of the stack in the same kernel
 * that's modifying it -- if the stack attempts to allocate beyond the end,
 * then the \c size() call will reflect that overflowed state, rather than the
 * corrected size reflecting the failed allocation.
 *
 * A third kernel with a single thread would then be responsible for clearing
 * the data:
 * \code
   __global__ clear_stack(const SecondaryRef ptrs)
   {
       StackAllocator<Secondary> allocate(ptrs);
       auto thread_idx = celeritas::KernelParamCalculator::thread_id().get();
       if (thread_idx == 0)
       {
           allocate.clear();
       }
   }
 * \endcode
 *
 * These separate kernel launches are needed as grid-level synchronization
 * points.
 *
 * \todo Instead of returning a pointer, return IdRange<T>. Rename
 * StackAllocatorData to StackAllocation and have it look like a collection so
 * that *it* will provide access to the data. Better yet, have a
 * StackAllocation that can be a `const_reference` to the StackAllocatorData.
 * Then the rule will be "you can't create a StackAllocator in the same kernel
 * that you directly access a StackAllocation".
 */
template<class T>
class StackAllocator
{
  public:
    //!@{
    //! \name Type aliases
    using value_type = T;
    using result_type = value_type*;
    using Data = StackAllocatorData<T, Ownership::reference, MemSpace::native>;
    //!@}

  public:
    // Construct with shared data
    explicit inline CELER_FUNCTION StackAllocator(Data const& data);

    // Total storage capacity (always safe)
    inline CELER_FUNCTION size_type capacity() const;

    //// INITIALIZE ////

    // Reset storage
    inline CELER_FUNCTION void clear();

    //// ALLOCATE ////

    // Allocate space for this many items
    inline CELER_FUNCTION result_type operator()(size_type count);

    //// ACCESS ////

    // Current size
    inline CELER_FUNCTION size_type size() const;

    // View all allocated data
    inline CELER_FUNCTION Span<value_type> get();
    inline CELER_FUNCTION Span<value_type const> get() const;

  private:
    Data const& data_;

    //// HELPER FUNCTIONS ////

    using SizeId = ItemId<size_type>;
    using StorageId = ItemId<T>;
    static CELER_CONSTEXPR_FUNCTION SizeId size_id() { return SizeId{0}; }
};

//---------------------------------------------------------------------------//
// INLINE DEFINITIONS
//---------------------------------------------------------------------------//
/*!
 * Construct with shared data.
 */
template<class T>
CELER_FUNCTION StackAllocator<T>::StackAllocator(Data const& shared)
    : data_(shared)
{
    CELER_EXPECT(shared);
}

//---------------------------------------------------------------------------//
/*!
 * Get the maximum number of values that can be allocated.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::capacity() const -> size_type
{
    return data_.storage.size();
}

//---------------------------------------------------------------------------//
/*!
 * Clear the stack allocator.
 *
 * This sets the size to zero. It should ideally *only* be called by a single
 * thread (though multiple threads resetting it should also be OK), but it
 * *cannot be used in the same kernel that is allocating or viewing it*. This
 * is because the access times between different threads or thread-blocks are
 * indeterminate inside of a single kernel.
 */
template<class T>
CELER_FUNCTION void StackAllocator<T>::clear()
{
    data_.size[this->size_id()] = 0;
}

//---------------------------------------------------------------------------//
/*!
 * Allocate space for a given number of items.
 *
 * Returns \c nullptr if allocation failed due to running out of memory.
 * Ensures that the shared size reflects the amount of data allocated.
 */
template<class T>
CELER_FUNCTION auto
StackAllocator<T>::operator()(size_type count) -> result_type
{
    CELER_EXPECT(count > 0);

    // Atomically add 'count' to the shared size
    size_type start = atomic_add(&data_.size[this->size_id()], count);
    if (CELER_UNLIKELY(start + count > data_.storage.size()))
    {
        // Out of memory: restore the old value so that another thread can
        // potentially use it. Multiple threads are likely to exceed the
        // capacity simultaneously.
        // Only one thread's "start" value is less than or equal to the total
        // capacity: the remainder are (arbitrarily) higher than that.
        if (start <= this->capacity())
        {
            // We were the first thread to exceed capacity, even though other
            // threads might have failed (and might still be failing) to
            // allocate. Restore the actual allocated size to the start value.
            // This might allow another thread with a smaller allocation to
            // succeed, but it also guarantees that at the end of the kernel,
            // the size reflects the actual capacity.
            data_.size[this->size_id()] = start;
        }

        /*!
         * \todo It might be useful to set an "out of memory" flag to make it
         * easier for host code to detect whether a failure occurred, rather
         * than looping through primaries and testing for failure.
         */

        // Return null pointer, indicating failure to allocate.
        return nullptr;
    }

    // Initialize the data at the newly "allocated" address
    value_type* result = new (&data_.storage[StorageId{start}]) value_type;
    for (size_type i = 1; i < count; ++i)
    {
        // Initialize remaining values
        CELER_ASSERT(&data_.storage[StorageId{start + i}] == result + i);
        new (&data_.storage[StorageId{start + i}]) value_type;
    }
    return result;
}

//---------------------------------------------------------------------------//
/*!
 * Get the number of items currently present.
 *
 * This value may not be meaningful (may be less than the "actual" size) if
 * called in the same kernel as other threads that are allocating.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::size() const -> size_type
{
    size_type result = data_.size[this->size_id()];
    CELER_ENSURE(result <= this->capacity());
    return result;
}

//---------------------------------------------------------------------------//
/*!
 * View all allocated data.
 *
 * This cannot be called while any running kernel could be modifying the size.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::get() -> Span<value_type>
{
    return data_.storage[ItemRange<T>{StorageId{0}, StorageId{this->size()}}];
}

//---------------------------------------------------------------------------//
/*!
 * View all allocated data (const).
 *
 * This cannot be called while any running kernel could be modifying the size.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::get() const -> Span<value_type const>
{
    return data_.storage[ItemRange<T>{StorageId{0}, StorageId{this->size()}}];
}

//---------------------------------------------------------------------------//
}  // namespace celeritas
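
//---------------------------------------------------------------------------//
/*!
 * A minimal host-only sketch of the atomic bump-allocation scheme that
 * \c operator() implements above, written against \c std::atomic rather than
 * the Celeritas collection and atomic helpers. The names \c SimpleStack and
 * \c try_allocate are hypothetical and are not part of this header or of
 * Celeritas; the sketch only illustrates the atomic-add/overflow-restore
 * logic, and it skips the placement-new initialization that the real
 * allocator performs because the vector's elements are already constructed.
 * \code
 #include <atomic>
 #include <cstddef>
 #include <vector>

 template<class T>
 struct SimpleStack
 {
     std::vector<T> storage;  // fixed-capacity backing store, sized up front
     std::atomic<std::size_t> size{0};  // shared "top of stack" counter

     // Reserve 'count' contiguous slots; return nullptr on overflow
     T* try_allocate(std::size_t count)
     {
         // Atomically bump the shared size and fetch its previous value
         std::size_t start = size.fetch_add(count);
         if (start + count > storage.size())
         {
             // Out of memory: only the first over-allocating caller sees a
             // 'start' at or below capacity, so only it restores the size
             if (start <= storage.size())
             {
                 size.store(start);
             }
             return nullptr;
         }
         return storage.data() + start;
     }
 };
 \endcode
 */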