Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:13

0001 #pragma once
0002 /**
0003 curandStatePhilox4_32_10_OpticksLite.h
0004 ========================================
0005 
0006 Experiment with counter based RNG, see notes::
0007 
0008     ~/o/notes/curand-impl-review-and-compare-to-curand-done-right.rst
0009 
0010 This specializes the curandStatePhilox4_32_10 impl to use minimal counter only state::
0011 
0012    /usr/local/cuda/include/curand_kernel.h
0013    /usr/local/cuda/include/curand_philox4x32_x.h
0014 
0015 As inspired by::
0016 
0017     https://github.com/kshitijl/curand-done-right
0018 
0019 +---------------------------------------+----------------+--------------------------------------------------+
0020 |                                       |  sizeof bytes  |   notes                                          |
0021 +=======================================+================+==================================================+
0022 | curandStateXORWOW                     |    48          |  curand default, expensive init => complications |
0023 +---------------------------------------+----------------+--------------------------------------------------+
0024 | curandStatePhilox4_32_10              |    64          |  cheap init (TODO: check in practice)            |
0025 +---------------------------------------+----------------+--------------------------------------------------+
0026 | curandStatePhilox4_32_10_OpticksLite  |    32          |  slim state to uint4 + uint2, gets padded to 32  |
0027 +---------------------------------------+----------------+--------------------------------------------------+
0028 
0029 See LICENSE.txt for usage conditions.
0030 
0031 Related tests::
0032 
0033    ~/o/sysrap/tests/curand_uniform_test.sh
0034    ~/o/sysrap/tests/curanddr_uniform_test.sh
0035 
0036 
0037 **/
0038 
0039 #include "curand_kernel.h"
0040 
0041 
0042 struct curandStatePhilox4_32_10_OpticksLite
0043 {
0044     uint4 ctr ; 
0045     uint2 key ;   
0046     // looks like 6*4=24 bytes, but gets padded to 32 bytes
0047 };
0048 
0049 QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_OpticksLite* s)
0050 {
0051    if(++s->ctr.x) return;
0052    if(++s->ctr.y) return;
0053    if(++s->ctr.z) return;
0054    ++s->ctr.w;
0055 }
0056 
0057 QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_OpticksLite* s, unsigned long long n)
0058 {
0059    unsigned int nlo = (unsigned int)(n);
0060    unsigned int nhi = (unsigned int)(n>>32);
0061 
0062    s->ctr.x += nlo;
0063    if( s->ctr.x < nlo )
0064       nhi++;
0065 
0066    s->ctr.y += nhi;
0067    if(nhi <= s->ctr.y)
0068       return;
0069    if(++s->ctr.z) return;
0070    ++s->ctr.w;
0071 }
0072 
0073 QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_OpticksLite* s, unsigned long long n)
0074 {
0075    unsigned int nlo = (unsigned int)(n);
0076    unsigned int nhi = (unsigned int)(n>>32);
0077 
0078    s->ctr.z += nlo;
0079    if( s->ctr.z < nlo )
0080       nhi++;
0081 
0082    s->ctr.w += nhi;
0083 }
0084 
0085 QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_OpticksLite* s)
0086 {
0087     Philox_State_Incr_hi(s, n);
0088 }
0089 
0090 QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_OpticksLite* s)
0091 {
0092     Philox_State_Incr(s, n);
0093 }
0094 
0095 
0096 QUALIFIERS void curand_init( unsigned long long seed,
0097                              unsigned long long subsequence,
0098                              unsigned long long offset,
0099                              curandStatePhilox4_32_10_OpticksLite* s )
0100 {
0101     s->ctr = make_uint4(0, 0, 0, 0);
0102     s->key.x = (unsigned int)seed;
0103     s->key.y = (unsigned int)(seed>>32);
0104 
0105     skipahead_sequence(subsequence, s);
0106     skipahead(offset, s);
0107 }
0108 
0109 QUALIFIERS float4 curand_uniform4( curandStatePhilox4_32_10_OpticksLite* s )
0110 {
0111    uint4 result = curand_Philox4x32_10(s->ctr, s->key);  
0112    Philox_State_Incr(s); 
0113    return _curand_uniform4(result) ; 
0114 }
0115 /**
0116 curand_uniform
0117 -----------------
0118 
0119 This wastefully only uses 1 of the 4 uint generated, 
0120 prefer instead to use curand_uniform4
0121 
0122 **/
0123 QUALIFIERS float curand_uniform( curandStatePhilox4_32_10_OpticksLite* s )
0124 {
0125    uint4 result = curand_Philox4x32_10(s->ctr, s->key);  
0126    Philox_State_Incr(s); 
0127    return _curand_uniform(result.x) ;    
0128 }
0129 
0130 
0131 /**
0132 curand_uniform4(curandStateXORWOW* state)
0133 -------------------------------------------
0134 
0135 API missed from XORWOW, added to allow templated 
0136 tests comparing between curandState types::
0137 
0138     curandStateXORWOW
0139     curandStatePhilox4_32_10
0140     curandStatePhilox4_32_10_OpticksLite
0141 
0142 **/
0143 
0144 QUALIFIERS float4 curand_uniform4(curandStateXORWOW* state)
0145 { 
0146     float4 result ; 
0147     result.x = curand_uniform(state); 
0148     result.y = curand_uniform(state); 
0149     result.z = curand_uniform(state); 
0150     result.w = curand_uniform(state); 
0151     return result ; 
0152 }
0153