//------------------------------- -*- C++ -*- -------------------------------//
// Copyright Celeritas contributors: see top-level COPYRIGHT file for details
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
//---------------------------------------------------------------------------//
//! \file corecel/data/StackAllocator.hh
//---------------------------------------------------------------------------//
#pragma once

#include <new>

#include "corecel/math/Atomics.hh"

#include "StackAllocatorData.hh"

namespace celeritas
{
//---------------------------------------------------------------------------//
/*!
 * Dynamically allocate arbitrary data on a stack.
 *
 * The stack allocator view acts as a functor and accessor to the allocated
 * data. It enables very fast on-device dynamic allocation of data, such as
 * secondaries or detector hits. As an example, inside a hypothetical physics
 * Interactor class, you could create two particles with the following code:
 * \code

 struct Interactor
 {
     StackAllocator<Secondary> allocate;

     // Sample an interaction
     template<class Engine>
     Interaction operator()(Engine&)
     {
         // Create 2 secondary particles
         Secondary* allocated = this->allocate(2);
         if (!allocated)
         {
             return Interaction::from_failure();
         }
         Interaction result;
         result.secondaries = Span<Secondary>{allocated, 2};
         return result;
     }
 };
 \endcode
 *
 * A later kernel could then iterate over the secondaries to apply cutoffs:
 * \code
 using SecondaryRef
     = StackAllocatorData<Secondary, Ownership::reference, MemSpace::device>;

 __global__ void apply_cutoff(const SecondaryRef ptrs)
 {
     auto thread_idx = celeritas::KernelParamCalculator::thread_id().get();
     StackAllocator<Secondary> allocate(ptrs);
     auto secondaries = allocate.get();
     if (thread_idx < secondaries.size())
     {
         Secondary& sec = secondaries[thread_idx];
         if (sec.energy < 100 * units::kilo_electron_volts)
         {
             sec.energy = 0;
         }
     }
 }
 * \endcode
 *
 * You *cannot* safely access the current size of the stack in the same kernel
 * that's modifying it -- if the stack attempts to allocate beyond the end,
 * then the \c size() call will reflect that overflowed state, rather than the
 * corrected size reflecting the failed allocation.
 *
 * A third kernel with a single thread would then be responsible for clearing
 * the data:
 * \code
 __global__ void clear_stack(const SecondaryRef ptrs)
 {
     StackAllocator<Secondary> allocate(ptrs);
     auto thread_idx = celeritas::KernelParamCalculator::thread_id().get();
     if (thread_idx == 0)
     {
         allocate.clear();
     }
 }
 * \endcode
 *
 * These separate kernel launches are needed as grid-level synchronization
 * points.
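 *
 * As a sketch of the resulting host-side launch order (the \c interact
 * kernel name and the launch parameters here are hypothetical, not part of
 * this interface):
 * \code
 interact<<<blocks, threads>>>(ptrs);      // allocates secondaries
 apply_cutoff<<<blocks, threads>>>(ptrs);  // reads size() and get()
 clear_stack<<<1, 1>>>(ptrs);              // resets the size to zero
 * \endcode
 * Each kernel launch boundary acts as a grid-wide barrier: all updates to the
 * shared size from one kernel are visible before the next kernel reads or
 * resets it.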
 *
 * \todo Instead of returning a pointer, return IdRange<T>. Rename
 * StackAllocatorData to StackAllocation and have it look like a collection so
 * that *it* will provide access to the data. Better yet, have a
 * StackAllocation that can be a `const_reference` to the StackAllocatorData.
 * Then the rule will be "you can't create a StackAllocator in the same kernel
 * that you directly access a StackAllocation".
 */
template<class T>
class StackAllocator
{
  public:
    //!@{
    //! \name Type aliases
    using value_type = T;
    using result_type = value_type*;
    using Data = StackAllocatorData<T, Ownership::reference, MemSpace::native>;
    //!@}

  public:
    // Construct with shared data
    explicit inline CELER_FUNCTION StackAllocator(Data const& data);

    // Total storage capacity (always safe)
    inline CELER_FUNCTION size_type capacity() const;

    //// INITIALIZE ////

    // Reset storage
    inline CELER_FUNCTION void clear();

    //// ALLOCATE ////

    // Allocate space for this many items
    inline CELER_FUNCTION result_type operator()(size_type count);

    //// ACCESS ////

    // Current size
    inline CELER_FUNCTION size_type size() const;

    // View all allocated data
    inline CELER_FUNCTION Span<value_type> get();
    inline CELER_FUNCTION Span<value_type const> get() const;

  private:
    Data const& data_;

    //// HELPER FUNCTIONS ////

    using SizeId = ItemId<size_type>;
    using StorageId = ItemId<T>;
    static CELER_CONSTEXPR_FUNCTION SizeId size_id() { return SizeId{0}; }
};

//---------------------------------------------------------------------------//
// INLINE DEFINITIONS
//---------------------------------------------------------------------------//
/*!
 * Construct with a reference to the shared allocator data.
 */
template<class T>
CELER_FUNCTION StackAllocator<T>::StackAllocator(Data const& shared)
    : data_(shared)
{
    CELER_EXPECT(shared);
}

//---------------------------------------------------------------------------//
/*!
 * Get the maximum number of values that can be allocated.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::capacity() const -> size_type
{
    return data_.storage.size();
}

//---------------------------------------------------------------------------//
/*!
 * Clear the stack allocator.
 *
 * This sets the size to zero. It should ideally *only* be called by a single
 * thread (though multiple threads resetting it should also be OK), but it
 * *cannot be used in the same kernel that is allocating or viewing it*. This
 * is because the access times between different threads or thread blocks are
 * indeterminate inside of a single kernel.
 */
template<class T>
CELER_FUNCTION void StackAllocator<T>::clear()
{
    data_.size[this->size_id()] = 0;
}

//---------------------------------------------------------------------------//
/*!
 * Allocate space for a given number of items.
 *
 * Returns \c nullptr if the allocation failed due to running out of memory.
 * Ensures that the shared size reflects the amount of data allocated.
 */
template<class T>
CELER_FUNCTION auto
StackAllocator<T>::operator()(size_type count) -> result_type
{
    CELER_EXPECT(count > 0);

    // Atomically add 'count' to the shared size
    size_type start = atomic_add(&data_.size[this->size_id()], count);
    if (CELER_UNLIKELY(start + count > data_.storage.size()))
    {
        // Out of memory: restore the old value so that another thread can
        // potentially use it. Multiple threads are likely to exceed the
        // capacity simultaneously. Only one has a "start" value less than or
        // equal to the total capacity: the remainder are (arbitrarily) higher
        // than that.
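        // For example (illustrative numbers only): with capacity 8 and
        // current size 6, threads A and B each request 4 items. A's atomic
        // add returns start = 6 (size becomes 10), and B's returns start =
        // 10 (size becomes 14). Both allocations fail, but only A satisfies
        // start <= capacity and restores the size to 6.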
        if (start <= this->capacity())
        {
            // We were the first thread to exceed capacity, even though other
            // threads might have failed (and might still be failing) to
            // allocate. Restore the actual allocated size to the start value.
            // This might allow another thread with a smaller allocation to
            // succeed, but it also guarantees that at the end of the kernel,
            // the size reflects the actual capacity.
            data_.size[this->size_id()] = start;
        }

        /*!
         * \todo It might be useful to set an "out of memory" flag to make it
         * easier for host code to detect whether a failure occurred, rather
         * than looping through primaries and testing for failure.
         */

        // Return null pointer, indicating failure to allocate
        return nullptr;
    }

    // Initialize the data at the newly "allocated" address
    value_type* result = new (&data_.storage[StorageId{start}]) value_type;
    for (size_type i = 1; i < count; ++i)
    {
        // Initialize remaining values
        CELER_ASSERT(&data_.storage[StorageId{start + i}] == result + i);
        new (&data_.storage[StorageId{start + i}]) value_type;
    }
    return result;
}

//---------------------------------------------------------------------------//
/*!
 * Get the number of items currently present.
 *
 * This value may not be meaningful (it may be less than the "actual" size) if
 * called in the same kernel as other threads that are allocating.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::size() const -> size_type
{
    size_type result = data_.size[this->size_id()];
    CELER_ENSURE(result <= this->capacity());
    return result;
}

//---------------------------------------------------------------------------//
/*!
 * View all allocated data.
 *
 * This cannot be called while any running kernel could be modifying the size.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::get() -> Span<value_type>
{
    return data_.storage[ItemRange<T>{StorageId{0}, StorageId{this->size()}}];
}

//---------------------------------------------------------------------------//
/*!
 * View all allocated data (const).
 *
 * This cannot be called while any running kernel could be modifying the size.
 */
template<class T>
CELER_FUNCTION auto StackAllocator<T>::get() const -> Span<value_type const>
{
    return data_.storage[ItemRange<T>{StorageId{0}, StorageId{this->size()}}];
}

//---------------------------------------------------------------------------//
}  // namespace celeritas