sysrap/tests/squadSelectTest.cu

0001 /**
0002
0003 ~/o/sysrap/tests/squadSelectTest.sh
0004
0005
0006 1. setup pp buffer on device
0007 2. thrust::count_if get the number of hits
0008 3. allocate device buffer for hits
0009 4. thrust::copy_if between the pp and hit buffers
0010 5. copy hits down to host
0011
0012 Q: can thrust::copy_if copy from device buffer to host buffer without the intermediate device buffer ?
0013 A: Crovella2016:NO https://stackoverflow.com/questions/36877029/thrust-copy-if-device-to-host
0014
0015 **/
0016
0017
#include <cstdlib>
#include <iostream>
#include <vector>

#include "scuda.h"
#include "squad.h"

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
0026
/**
select_count
--------------

Returns the number of items in the device array *d* (holding *num_d* items)
for which *selector* evaluates true, using thrust::count_if.

The selector is taken by const reference, matching the signatures of
select_copy_device_to_host and select_copy_device_to_host_presized, so the
same selector object (or a temporary) can be handed to all three.

**/

template<typename T>
unsigned select_count( T* d, unsigned num_d, const qselector<T>& selector )
{
    thrust::device_ptr<T> td(d);
    // count_if yields a signed difference_type, it is bounded by num_d so the narrowing is safe
    return static_cast<unsigned>( thrust::count_if(td, td+num_d , selector ) );
}
0033
/**
select_copy_device_to_host
----------------------------

Counts the items of the device array *d* (*num_d* items) passing *selector*,
compacts them into a temporary device buffer with thrust::copy_if and copies
that buffer into a freshly allocated host array.

h
    output, set to a host array allocated with ``new T[num_select]``,
    the caller owns it and must release it with ``delete []``
num_select
    output, set to the number of selected items
d
    device pointer to the items to select from
num_d
    number of items in *d*
selector
    predicate applied on the device to every item

This API is awkward because the number selected is not known when making the call.
For example it would be difficult to populate an NP array using this without
making copies.

The temporary device buffer is released before returning.
A failing CUDA runtime call is reported on stderr and aborts the process.

**/

template<typename T>
void select_copy_device_to_host( T** h, unsigned& num_select,  T* d, unsigned num_d, const qselector<T>& selector  )
{
    thrust::device_ptr<T> td(d);
    num_select = thrust::count_if(td, td+num_d , selector );
    std::cout << " num_select " << num_select << std::endl ;

    // always hand back an array, possibly of zero length, so the caller can unconditionally delete []
    *h = new T[num_select] ;
    if( num_select == 0 ) return ;   // nothing selected : no device buffer or copy needed

    T* d_select = nullptr ;
    cudaError_t err = cudaMalloc(&d_select, num_select*sizeof(T));
    if( err != cudaSuccess )
    {
        std::cerr << "select_copy_device_to_host : cudaMalloc failed : " << cudaGetErrorString(err) << std::endl ;
        std::abort();
    }

    thrust::device_ptr<T> td_select(d_select);
    thrust::copy_if(td, td+num_d , td_select, selector );

    err = cudaMemcpy(*h, d_select, num_select*sizeof(T), cudaMemcpyDeviceToHost);

    // the intermediate buffer is only needed for the compaction, release it whatever the copy outcome
    cudaFree(d_select);

    if( err != cudaSuccess )
    {
        std::cerr << "select_copy_device_to_host : cudaMemcpy failed : " << cudaGetErrorString(err) << std::endl ;
        std::abort();
    }
}
0061
0062
/**
select_copy_device_to_host_presized
--------------------------------------

Compacts the items of the device array *d* (*num_d* items) passing *selector*
into a temporary device buffer of *num_select* items with thrust::copy_if and
copies that buffer into the host array *h*.

The host array must be presized to fit the selection, do so using *select_count* with the same selector.
Passing a *num_select* smaller than the actual number of selected items would
make thrust::copy_if write beyond the temporary device buffer.

The temporary device buffer is released before returning.
A failing CUDA runtime call is reported on stderr and aborts the process.

**/

template<typename T>
void select_copy_device_to_host_presized( T* h, T* d, unsigned num_d, const qselector<T>& selector, unsigned num_select  )
{
    if( num_select == 0 ) return ;   // empty selection : nothing to allocate or copy

    thrust::device_ptr<T> td(d);

    T* d_select = nullptr ;
    cudaError_t err = cudaMalloc(&d_select, num_select*sizeof(T));
    if( err != cudaSuccess )
    {
        std::cerr << "select_copy_device_to_host_presized : cudaMalloc failed : " << cudaGetErrorString(err) << std::endl ;
        std::abort();
    }

    thrust::device_ptr<T> td_select(d_select);
    thrust::copy_if(td, td+num_d , td_select, selector );

    err = cudaMemcpy(h, d_select, num_select*sizeof(T), cudaMemcpyDeviceToHost);

    // the intermediate buffer is only needed for the compaction, release it whatever the copy outcome
    cudaFree(d_select);

    if( err != cudaSuccess )
    {
        std::cerr << "select_copy_device_to_host_presized : cudaMemcpy failed : " << cudaGetErrorString(err) << std::endl ;
        std::abort();
    }
}
0085
/**
populate
----------

Fills the host array *pp* of *num_p* items with test content. Each item is
zeroed, then:

* q0.f.x is set to 1000 times the item index, as a float
* q3.u.x is set to the item index
* q3.u.w is set to *mask* when the index is divisible by 3, otherwise to the index

**/

void populate( quad4* pp, unsigned num_p, unsigned mask )
{
    unsigned idx = 0 ;
    while( idx < num_p )
    {
        quad4& item = pp[idx] ;
        item.zero();

        item.q0.f.x = float(idx*1000) ;
        item.q3.u.x = idx ;

        if( idx % 3 == 0 )
        {
            item.q3.u.w = mask ;
        }
        else
        {
            item.q3.u.w = idx ;
        }

        idx++ ;
    }
}
0098
/**
dump
------

Writes the item count followed by one line per item of the host array *pp*
to stdout, each line listing the four unsigned values q3.u.x q3.u.y q3.u.z q3.u.w

**/

void dump( const quad4* pp, unsigned num_p )
{
    std::cout << " dump num_p:" << num_p << std::endl ;

    const quad4* const end = pp + num_p ;
    for(const quad4* it = pp ; it != end ; ++it)
    {
        std::cout << " h " ;
        std::cout << it->q3.u.x << " " ;
        std::cout << it->q3.u.y << " " ;
        std::cout << it->q3.u.z << " " ;
        std::cout << it->q3.u.w << " " ;
        std::cout << std::endl ;
    }
}
0115
/**
upload
--------

Allocates a device buffer for *num_items* items of type T with cudaMalloc,
copies the host array *h* into it and returns the device pointer.

The caller owns the returned buffer and must release it with cudaFree.
A failing CUDA runtime call is reported on stderr and aborts the process,
rather than returning an unusable pointer.

**/

template<typename T>
T* upload(const T* h, unsigned num_items )
{
    const size_t num_bytes = num_items*sizeof(T) ;

    T* d = nullptr ;
    cudaError_t err = cudaMalloc(&d, num_bytes);
    if( err != cudaSuccess )
    {
        std::cerr << "upload : cudaMalloc failed : " << cudaGetErrorString(err) << std::endl ;
        std::abort();
    }

    err = cudaMemcpy(d, h, num_bytes, cudaMemcpyHostToDevice);
    if( err != cudaSuccess )
    {
        std::cerr << "upload : cudaMemcpy failed : " << cudaGetErrorString(err) << std::endl ;
        std::abort();
    }
    return d ;
}
0124
/**
test_monolithic
-----------------

Populates 10 items on the host, uploads them, then uses
select_copy_device_to_host to count, select and download the selected items
in a single call, and dumps the result.

The host hit array allocated by select_copy_device_to_host and the uploaded
device buffer are both released before returning.

**/

void test_monolithic()
{
    std::vector<quad4> pp(10) ;
    unsigned mask = 0xbeefcafe ;
    populate(pp.data(), pp.size(), mask);

    unsigned num_p = pp.size();
    quad4* d_pp = upload(pp.data(), num_p);

    // both are outputs of select_copy_device_to_host, initialized defensively
    quad4* hit = nullptr ;
    unsigned num_hit = 0 ;
    qselector<quad4> selector(mask);

    select_copy_device_to_host( &hit, num_hit, d_pp, num_p, selector );

    dump( hit, num_hit );

    delete [] hit ;      // allocated with new [] inside select_copy_device_to_host
    cudaFree(d_pp);      // allocated by upload
}
0142
/**
test_presized
---------------

Populates 10 items on the host, uploads them, obtains the hit count with
select_count, presizes a host array accordingly and fills it using
select_copy_device_to_host_presized with the same selector, then dumps it.

The host hits live in a std::vector and the uploaded device buffer is
released with cudaFree, so nothing is leaked.

**/

void test_presized()
{
    // obtain the hit count, presize host array then copy

    std::vector<quad4> pp(10) ;
    unsigned mask = 0xbeefcafe ;
    populate(pp.data(), pp.size(), mask);

    unsigned num_p = pp.size();
    quad4* d_pp = upload(pp.data(), num_p);

    qselector<quad4> selector(mask);
    unsigned num_hit = select_count( d_pp, num_p, selector );
    std::cout << " num_hit " << num_hit << std::endl ;

    // vector owns the host hits, replacing the never deleted new quad4[num_hit]
    std::vector<quad4> hit(num_hit) ;
    select_copy_device_to_host_presized( hit.data(), d_pp, num_p, selector, num_hit );

    dump( hit.data(), num_hit );

    cudaFree(d_pp);      // allocated by upload
}
0163
0164
/**
main
------

Runs the presized selection test, the monolithic variant is left disabled.
Always returns 0.

**/

int main()
{
    //test_monolithic();
    test_presized();
    return 0 ;
}