File indexing completed on 2026-04-09 07:49:20
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
#include <cstdlib>
#include <iostream>
#include <vector>

#include "scuda.h"
#include "squad.h"

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
0026
/**
select_count
-------------

Counts the elements of the device array d[0..num_d) for which the
selector predicate evaluates true, using thrust::count_if.

d        : raw pointer, wrapped in thrust::device_ptr so thrust treats it as device memory
num_d    : number of elements of type T reachable from d
selector : predicate functor handed (by value) to thrust::count_if

Returns the count converted to unsigned.
**/
template<typename T>
unsigned select_count( T* d, unsigned num_d, qselector<T>& selector )
{
    const thrust::device_ptr<T> first(d) ;
    const thrust::device_ptr<T> last = first + num_d ;
    return thrust::count_if( first, last, selector ) ;
}
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
/**
select_copy_device_to_host
---------------------------

Counts the elements of the device array d[0..num_d) accepted by selector,
compacts them into a temporary device buffer with thrust::copy_if and
copies that buffer into a freshly allocated host array.

h          : output, receives a pointer to a new T[num_select] array;
             ownership passes to the caller, who must delete [] it
num_select : output, number of selected elements
d          : raw pointer wrapped in thrust::device_ptr, so it must address device memory
num_d      : number of elements of type T reachable from d
selector   : predicate functor passed to thrust::count_if and thrust::copy_if

Any failing CUDA runtime call is reported on std::cerr and aborts the process.
The temporary device buffer is released before returning.
**/
template<typename T>
void select_copy_device_to_host( T** h, unsigned& num_select, T* d, unsigned num_d, const qselector<T>& selector )
{
    // Report and abort on a CUDA runtime failure rather than carrying on with bad pointers.
    auto check = []( cudaError_t rc, const char* what )
    {
        if( rc != cudaSuccess )
        {
            std::cerr
                << "select_copy_device_to_host : " << what
                << " failed : " << cudaGetErrorString(rc)
                << std::endl
                ;
            std::abort();
        }
    };

    thrust::device_ptr<T> td(d);
    num_select = thrust::count_if(td, td+num_d , selector );
    std::cout << " num_select " << num_select << std::endl ;

    // The host array is allocated even when empty, so the caller can always delete [] it.
    *h = new T[num_select] ;
    if( num_select == 0 ) return ;   // nothing selected : no device buffer needed

    const size_t num_bytes = num_select*sizeof(T) ;

    T* d_select = nullptr ;
    check( cudaMalloc(&d_select, num_bytes), "cudaMalloc" );

    thrust::device_ptr<T> td_select(d_select);
    thrust::copy_if(td, td+num_d , td_select, selector );

    check( cudaMemcpy(*h, d_select, num_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy" );

    // Release the temporary compaction buffer : it was previously leaked on every call.
    check( cudaFree(d_select), "cudaFree" );
}
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
/**
select_copy_device_to_host_presized
------------------------------------

Compacts the elements of the device array d[0..num_d) accepted by selector
into a temporary device buffer with thrust::copy_if and copies num_select
elements of that buffer into the caller provided host array h.

h          : host destination with room for at least num_select elements
d          : raw pointer wrapped in thrust::device_ptr, so it must address device memory
num_d      : number of elements of type T reachable from d
selector   : predicate functor passed to thrust::copy_if
num_select : number of elements the selector accepts, which the caller must have
             determined beforehand (eg with select_count using the same selector);
             the temporary buffer is sized from it, so a value smaller than the
             real count makes thrust::copy_if write past the end of that buffer

Any failing CUDA runtime call is reported on std::cerr and aborts the process.
The temporary device buffer is released before returning.
**/
template<typename T>
void select_copy_device_to_host_presized( T* h, T* d, unsigned num_d, const qselector<T>& selector, unsigned num_select )
{
    if( num_select == 0 ) return ;   // nothing to copy : skip the device allocation entirely

    // Report and abort on a CUDA runtime failure rather than carrying on with bad pointers.
    auto check = []( cudaError_t rc, const char* what )
    {
        if( rc != cudaSuccess )
        {
            std::cerr
                << "select_copy_device_to_host_presized : " << what
                << " failed : " << cudaGetErrorString(rc)
                << std::endl
                ;
            std::abort();
        }
    };

    thrust::device_ptr<T> td(d);

    const size_t num_bytes = num_select*sizeof(T) ;

    T* d_select = nullptr ;
    check( cudaMalloc(&d_select, num_bytes), "cudaMalloc" );

    thrust::device_ptr<T> td_select(d_select);
    thrust::copy_if(td, td+num_d , td_select, selector );

    check( cudaMemcpy(h, d_select, num_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy" );

    // Release the temporary compaction buffer : it was previously leaked on every call.
    check( cudaFree(d_select), "cudaFree" );
}
0085
/**
populate
---------

Fills pp[0..num_p) with recognisable test content. Each item is zeroed and then :

* q0.f.x is set to float(index*1000)
* q3.u.x is set to the index
* q3.u.w is set to mask when the index is divisible by 3, otherwise to the index
**/
void populate( quad4* pp, unsigned num_p, unsigned mask )
{
    unsigned idx = 0 ;
    while( idx < num_p )
    {
        quad4& item = pp[idx] ;
        item.zero();

        item.q0.f.x = float(idx*1000) ;
        item.q3.u.x = idx ;

        const bool every_third = ( idx % 3 == 0 ) ;
        if( every_third )
        {
            item.q3.u.w = mask ;
        }
        else
        {
            item.q3.u.w = idx ;
        }

        idx++ ;
    }
}
0098
/**
dump
-----

Writes a header line with num_p to std::cout, followed by one line per item
of pp[0..num_p) listing the four components of q3.u separated by spaces.
**/
void dump( const quad4* pp, unsigned num_p )
{
    std::cout << " dump num_p:" << num_p << std::endl ;

    const quad4* const stop = pp + num_p ;
    for( const quad4* item = pp ; item != stop ; ++item )
    {
        const auto& u = item->q3.u ;
        std::cout << " h " ;
        std::cout << u.x << " " ;
        std::cout << u.y << " " ;
        std::cout << u.z << " " ;
        std::cout << u.w << " " ;
        std::cout << std::endl ;
    }
}
0115
/**
upload
-------

Allocates a buffer with cudaMalloc sized for num_items elements of type T and
copies the host array h[0..num_items) into it with cudaMemcpy (host to device).

h         : host source array with at least num_items elements
num_items : number of elements to copy

Returns the pointer obtained from cudaMalloc; the caller is responsible for
releasing it with cudaFree. A failing CUDA runtime call is reported on
std::cerr and aborts the process, so an invalid pointer is never returned.
**/
template<typename T>
T* upload(const T* h, unsigned num_items )
{
    // Report and abort on a CUDA runtime failure rather than returning a bad pointer.
    auto check = []( cudaError_t rc, const char* what )
    {
        if( rc != cudaSuccess )
        {
            std::cerr
                << "upload : " << what
                << " failed : " << cudaGetErrorString(rc)
                << std::endl
                ;
            std::abort();
        }
    };

    const size_t num_bytes = num_items*sizeof(T) ;

    T* d = nullptr ;
    check( cudaMalloc(&d, num_bytes), "cudaMalloc" );
    check( cudaMemcpy(d, h, num_bytes, cudaMemcpyHostToDevice), "cudaMemcpy" );
    return d ;
}
0124
/**
test_monolithic
----------------

Populates ten quad4 on the host, uploads them, then uses
select_copy_device_to_host to count, compact and download the items accepted
by a qselector constructed from mask, and dumps the result.

The device input buffer and the host array allocated by
select_copy_device_to_host are released before returning.
**/
void test_monolithic()
{
    std::vector<quad4> pp(10) ;
    unsigned mask = 0xbeefcafe ;
    populate(pp.data(), pp.size(), mask);

    unsigned num_p = pp.size();
    quad4* d_pp = upload(pp.data(), num_p);

    quad4* hit = nullptr ;
    unsigned num_hit = 0 ;
    qselector<quad4> selector(mask);

    select_copy_device_to_host( &hit, num_hit, d_pp, num_p, selector );

    dump( hit, num_hit );

    // hit was allocated with new [] by select_copy_device_to_host : this function owns it
    delete [] hit ;

    // d_pp came from upload : release it, reporting (but not aborting on) a failure
    cudaError_t rc = cudaFree(d_pp) ;
    if( rc != cudaSuccess )
    {
        std::cerr << "test_monolithic : cudaFree failed : " << cudaGetErrorString(rc) << std::endl ;
    }
}
0142
/**
test_presized
--------------

Populates ten quad4 on the host, uploads them, counts the items accepted by a
qselector constructed from mask with select_count, allocates a host array of
exactly that size and fills it with select_copy_device_to_host_presized,
then dumps the result.

The host array and the device input buffer are released before returning.
**/
void test_presized()
{
    std::vector<quad4> pp(10) ;
    unsigned mask = 0xbeefcafe ;
    populate(pp.data(), pp.size(), mask);

    unsigned num_p = pp.size();
    quad4* d_pp = upload(pp.data(), num_p);

    qselector<quad4> selector(mask);
    unsigned num_hit = select_count( d_pp, num_p, selector );
    std::cout << " num_hit " << num_hit << std::endl ;

    quad4* hit = new quad4[num_hit] ;
    select_copy_device_to_host_presized( hit, d_pp, num_p, selector, num_hit );

    dump( hit, num_hit );

    // hit was allocated with new [] just above : release it
    delete [] hit ;

    // d_pp came from upload : release it, reporting (but not aborting on) a failure
    cudaError_t rc = cudaFree(d_pp) ;
    if( rc != cudaSuccess )
    {
        std::cerr << "test_presized : cudaFree failed : " << cudaGetErrorString(rc) << std::endl ;
    }
}
0163
0164
/**
main
-----

Runs test_presized and returns 0.
test_monolithic is defined in this file but is not invoked here.
**/
int main()
{
    test_presized();
    return 0 ;
}