Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:20

0001 /**
0002 
0003 ~/o/sysrap/tests/squadSelectTest.sh
0004 
0005 
0006 1. setup pp buffer on device
0007 2. thrust::count_if get the number of hits
0008 3. allocate device buffer for hits
0009 4. thrust::copy_if between the pp and hit buffers
0010 5. copy hits down to host   
0011 
0012 Q: can thrust::copy_if copy from device buffer to host buffer without the intermediate device buffer ?
0013 A: Crovella2016:NO https://stackoverflow.com/questions/36877029/thrust-copy-if-device-to-host
0014 
0015 **/
0016 
0017 
#include <iostream>
#include <vector>

#include "scuda.h"
#include "squad.h"

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
0026 
/**
select_count
--------------

Returns the number of elements in the device array *d* (of length *num_d*)
for which *selector* evaluates true, as counted by thrust::count_if.

The selector is accepted by const reference so that a const selector or a
temporary can be used for counting as well as for the copy functions, which
already take their selector by const reference. The selector is only handed
on to thrust::count_if and is not modified here.

**/

template<typename T>
unsigned select_count( T* d, unsigned num_d,  const qselector<T>& selector )
{
    thrust::device_ptr<T> td(d);
    return thrust::count_if(td, td+num_d , selector );
}
0033 
/**
select_copy_device_to_host
----------------------------

Counts the elements of the device array *d* (length *num_d*) accepted by *selector*,
compacts them into a temporary device buffer with thrust::copy_if and copies
that buffer into a host array allocated here with new[].

Outputs:

*h*
    set to the new[] allocated host array holding the selected elements,
    ownership passes to the caller who must delete [] it.
    Set to nullptr when a CUDA call fails.

*num_select*
    set to the number of selected elements, or to zero when a CUDA call fails.

The temporary device buffer is always released before returning.

This API is awkward because the number selected is not known when making the call.
For example it would be difficult to populate an NP array using this without 
making copies. 

**/

template<typename T>
void select_copy_device_to_host( T** h, unsigned& num_select,  T* d, unsigned num_d, const qselector<T>& selector  )
{
    thrust::device_ptr<T> td(d);
    num_select = thrust::count_if(td, td+num_d , selector );
    std::cout << " num_select " << num_select << std::endl ; 

    *h = nullptr ; 

    if( num_select == 0 )
    {
        // nothing selected : hand back an empty array without touching the device 
        *h = new T[0] ; 
        return ; 
    }

    T* d_select = nullptr ;   
    cudaError_t rc = cudaMalloc(&d_select, num_select*sizeof(T));
    if( rc != cudaSuccess )
    {
        std::cerr 
            << "select_copy_device_to_host : cudaMalloc failed : " 
            << cudaGetErrorString(rc) 
            << std::endl 
            ; 
        num_select = 0 ; 
        return ; 
    }

    T* h_select = nullptr ; 
    try
    {
        thrust::device_ptr<T> td_select(d_select);  
        thrust::copy_if(td, td+num_d , td_select, selector );

        h_select = new T[num_select] ; 
        rc = cudaMemcpy(h_select, d_select, num_select*sizeof(T), cudaMemcpyDeviceToHost);
    }
    catch(...)
    {
        // thrust and new[] report failure by throwing : release both buffers before propagating 
        delete [] h_select ; 
        cudaFree(d_select); 
        throw ; 
    }

    // the compacted device buffer is only scratch space, free it now the host has the data 
    cudaFree(d_select); 

    if( rc != cudaSuccess )
    {
        std::cerr 
            << "select_copy_device_to_host : cudaMemcpy failed : " 
            << cudaGetErrorString(rc) 
            << std::endl 
            ; 
        delete [] h_select ; 
        num_select = 0 ; 
        return ; 
    }

    *h = h_select ; 
}
0061 
0062 
/**
select_copy_device_to_host_presized
--------------------------------------

Compacts the elements of the device array *d* (length *num_d*) accepted by *selector*
into a temporary device buffer of *num_select* elements using thrust::copy_if,
then copies that buffer into the host array *h*.

The host array must be presized to fit the selection, do so using *select_count* with the same selector. 
The temporary device buffer is sized from *num_select* too, so a *num_select* smaller than
the actual number of selected elements would make thrust::copy_if write beyond it.

When *num_select* is zero nothing is done. CUDA failures are reported to std::cerr,
in which case the content of *h* is not filled. The temporary device buffer is 
always released before returning.

**/

template<typename T>
void select_copy_device_to_host_presized( T* h, T* d, unsigned num_d, const qselector<T>& selector, unsigned num_select  )
{
    if( num_select == 0 ) return ;   // nothing to copy 

    thrust::device_ptr<T> td(d);

    T* d_select = nullptr ;   
    cudaError_t rc = cudaMalloc(&d_select, num_select*sizeof(T));
    if( rc != cudaSuccess )
    {
        std::cerr 
            << "select_copy_device_to_host_presized : cudaMalloc failed : " 
            << cudaGetErrorString(rc) 
            << std::endl 
            ; 
        return ; 
    }

    try
    {
        thrust::device_ptr<T> td_select(d_select);  
        thrust::copy_if(td, td+num_d , td_select, selector );

        rc = cudaMemcpy(h, d_select, num_select*sizeof(T), cudaMemcpyDeviceToHost);
    }
    catch(...)
    {
        // thrust reports failure by throwing : release the scratch buffer before propagating 
        cudaFree(d_select); 
        throw ; 
    }

    // the compacted device buffer is only scratch space, free it now the host has the data 
    cudaFree(d_select); 

    if( rc != cudaSuccess )
    {
        std::cerr 
            << "select_copy_device_to_host_presized : cudaMemcpy failed : " 
            << cudaGetErrorString(rc) 
            << std::endl 
            ; 
    }
}
0085 
/**
populate
----------

Fills the *num_p* items of the host array *pp* with test content.
Each item is zeroed and then:

* q0.f.x is set to the item index times 1000, converted to float 
* q3.u.x is set to the item index 
* q3.u.w is set to *mask* for every third item (index 0,3,6,...) and to the item index otherwise 

**/

void populate( quad4* pp, unsigned num_p, unsigned mask )
{
    unsigned idx = 0 ; 
    while( idx < num_p )
    {
        quad4& item = pp[idx] ; 
        item.zero(); 

        item.q0.f.x = float(idx*1000) ; 
        item.q3.u.x = idx ; 

        if( idx % 3 == 0 )
        {
            item.q3.u.w = mask ; 
        }
        else
        {
            item.q3.u.w = idx ; 
        }

        idx++ ; 
    }
}
0098 
/**
dump
------

Writes the item count followed by one line per item of the host array *pp* to std::cout, 
each line listing the x,y,z,w components of q3.u for that item.

**/

void dump( const quad4* pp, unsigned num_p )
{
    std::cout << " dump num_p:" << num_p << std::endl ; 

    const quad4* end = pp + num_p ; 
    for( const quad4* it = pp ; it != end ; ++it )
    {
        const auto& u = it->q3.u ; 
        std::cout << " h " << u.x << " " ; 
        std::cout << u.y << " " ; 
        std::cout << u.z << " " ; 
        std::cout << u.w << " " ; 
        std::cout << std::endl ; 
    }
}
0115 
/**
upload
--------

Allocates a device buffer for *num_items* elements of type T with cudaMalloc and
copies the host array *h* into it with cudaMemcpy.

Returns the device pointer, which the caller must release with cudaFree. 
Returns nullptr when the allocation or the copy fails, after reporting the 
CUDA error to std::cerr. The device buffer is freed when the copy fails.

**/

template<typename T>
T* upload(const T* h, unsigned num_items )
{
    T* d = nullptr ;
    cudaError_t rc = cudaMalloc(&d, num_items*sizeof(T));
    if( rc != cudaSuccess )
    {
        std::cerr 
            << "upload : cudaMalloc failed : " 
            << cudaGetErrorString(rc) 
            << std::endl 
            ; 
        return nullptr ; 
    }

    rc = cudaMemcpy(d, h, num_items*sizeof(T), cudaMemcpyHostToDevice);
    if( rc != cudaSuccess )
    {
        std::cerr 
            << "upload : cudaMemcpy failed : " 
            << cudaGetErrorString(rc) 
            << std::endl 
            ; 
        cudaFree(d);    // do not hand back a buffer with undefined content 
        return nullptr ; 
    }
    return d ; 
}
0124 
/**
test_monolithic
-----------------

Populates 10 items on the host, uploads them, then uses select_copy_device_to_host 
to count, select and download the items matching the mask in a single call, 
and dumps the selected items.

The host array returned by select_copy_device_to_host and the uploaded device 
buffer are both released before returning.

**/

void test_monolithic()
{
    std::vector<quad4> pp(10) ; 
    unsigned mask = 0xbeefcafe ; 
    populate(pp.data(), pp.size(), mask); 

    unsigned num_p = pp.size(); 
    quad4* d_pp = upload(pp.data(), num_p);   

    quad4* hit = nullptr ; 
    unsigned num_hit = 0 ; 
    qselector<quad4> selector(mask); 

    select_copy_device_to_host( &hit, num_hit, d_pp, num_p, selector );  

    dump( hit, num_hit );     

    delete [] hit ;     // allocated with new[] inside select_copy_device_to_host 
    cudaFree(d_pp);     // allocated with cudaMalloc inside upload 
}
0142 
/**
test_presized
---------------

Populates 10 items on the host, uploads them, obtains the hit count with select_count, 
presizes a host array to that count and fills it with select_copy_device_to_host_presized, 
then dumps the selected items.

The host hit array and the uploaded device buffer are both released before returning.

**/

void test_presized()
{
    // obtain the hit count, presize host array then copy

    std::vector<quad4> pp(10) ; 
    unsigned mask = 0xbeefcafe ; 
    populate(pp.data(), pp.size(), mask); 

    unsigned num_p = pp.size(); 
    quad4* d_pp = upload(pp.data(), num_p);   

    qselector<quad4> selector(mask); 
    unsigned num_hit = select_count( d_pp, num_p, selector ); 
    std::cout << " num_hit " << num_hit << std::endl ; 

    quad4* hit = new quad4[num_hit] ; 
    select_copy_device_to_host_presized( hit, d_pp, num_p, selector, num_hit ); 

    dump( hit, num_hit );     

    delete [] hit ;     // allocated with new[] above 
    cudaFree(d_pp);     // allocated with cudaMalloc inside upload 
}
0163 
0164 
/**
main
------

Runs the presized selection test, the monolithic variant is left disabled. 
Always returns zero.

**/

int main()
{
    const int rc = 0 ; 

    //test_monolithic();
    test_presized(); 

    return rc ; 
}