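// TensorDeviceSycl.h: SYCL backend for the Eigen Tensor module. It defines the
// QueueInterface that owns a cl::sycl::queue together with the virtual-pointer
// memory management, and the SyclDevice wrapper used by the tensor evaluators.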
#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

namespace Eigen {

namespace TensorSycl {
namespace internal {

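// Caches the device properties (local memory type/size, work-group limits,
// platform and device names) queried once from the queue, so later calls do
// not have to go back to the SYCL runtime.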
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // namespace internal
}  // namespace TensorSycl

typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;

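// Returns the list of SYCL devices the Eigen SYCL backend considers usable.
// Host devices, "experimental" platforms and AMD-platform CPU devices (unless
// they are APUs) are filtered out; with EIGEN_SYCL_USE_DEFAULT_SELECTOR only
// the default-selected device is returned.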
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}

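// QueueInterface owns the cl::sycl::queue, the pointer mapper used to emulate
// raw device pointers on top of SYCL buffers, a small thread pool for
// asynchronous callbacks, and the cached device information.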
class QueueInterface {
 public:
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }

  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}

#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif

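  // Registers an externally created SYCL buffer with the pointer mapper and
  // returns the virtual pointer associated with it.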
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

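  // Detaches a previously attached buffer: removes its entry from the
  // pointer mapper.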
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }

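  // Allocates num_bytes on the device through the pointer mapper and returns a
  // virtual pointer. The size is rounded up to a multiple of
  // EIGEN_MAX_ALIGN_BYTES so that offsets derived from it stay aligned.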
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }

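  // Allocates temporary device memory. Unless EIGEN_SYCL_NO_REUSE_BUFFERS is
  // defined, scratch buffers released via deallocate_temp are kept in a pool
  // and reused here when one of them is large enough.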
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    } else {
      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
        auto buff = pMapper.get_buffer(*it);
        if (buff.get_size() >= num_bytes) {
          auto ptr = *it;
          scratch_buffers.erase(it);
          return ptr;
        } else {
          ++it;
        }
      }
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }

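  // Returns a temporary allocation to the scratch pool (or frees it when
  // buffer reuse is disabled).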
  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

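  // deallocate frees a single virtual pointer; deallocate_all releases every
  // allocation tracked by the pointer mapper (including the scratch pool).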
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

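  // Copies n bytes from host memory to the device buffer behind dst. The copy
  // is submitted to the queue; when a callback is given, the wait and the
  // callback run on the internal thread pool, otherwise the call blocks.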
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }

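  // Copies n bytes from the device buffer behind src to host memory, with the
  // same synchronous/asynchronous behaviour as memcpyHostToDevice.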
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }

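  // Device-to-device copy of n bytes between two virtual pointers.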
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }

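  // Fills n bytes of device memory with the byte value c, using cgh.fill on a
  // discard_write accessor.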
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }

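  // Creates a placeholder accessor of type T for the buffer behind ptr. The
  // buffer is reinterpreted to T and the returned RangeAccess carries the
  // element offset, so virtual-pointer arithmetic keeps working inside kernels.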
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }

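  // Creates a ranged accessor bound to a command-group handler, covering
  // n_bytes of the buffer behind ptr starting at its offset.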
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    eigen_assert(offset + n_bytes <= buffer.get_size());
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }

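  // Creates an accessor for the whole buffer behind ptr, bound to cgh.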
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }

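  // Submits a kernel with two inputs and one output. The expressions are bound
  // to the command group, a local-memory scratch accessor of scratchSize
  // elements is created, and the kernel functor is launched with parallel_for
  // over thread_range.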
  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

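  // Same as binary_kernel_launcher but with a single input expression.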
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

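  // Same pattern again, with only one bound expression passed to the kernel.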
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

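  // synchronize blocks until the queue is drained (rethrowing asynchronous
  // errors when exceptions are enabled); async_synchronize records the event
  // and, unless EIGEN_SYCL_ASYNC_EXECUTION is defined, also blocks.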
  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }

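  // Computes a 1D launch configuration: tileSize is the work-group size
  // (capped by EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
  // rng is the problem size clamped to at least 1, and GRange is rng rounded
  // up to a multiple of tileSize.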
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }

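  // 2D variant: splits the maximum work-group size between the two dimensions
  // (as powers of two) and rounds each global range up to a multiple of the
  // corresponding local range.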
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

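  // 3D variant of the same splitting scheme.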
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }

  // SYCL does not expose a CUDA-style device version; a fixed value is
  // reported to keep the Tensor device interface uniform.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // There is no direct OpenCL/SYCL equivalent of this CUDA property; a
    // fixed value is returned.
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }

  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }

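  // Rounds wGSize to a power of two using the classic bit-smearing trick:
  // with roundUp it returns the next power of two >= wGSize, otherwise the
  // largest power of two <= wGSize.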
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }

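  // Returns true when no asynchronous exception has been recorded; if none
  // has, it first drains the queue so pending errors get a chance to surface.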
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    eigen_assert(false);
    return cl::sycl::event();
#endif
  }

  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

 protected:
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }

  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  bool exception_caught_ = false;

  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif

  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};

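// Non-owning handle to a QueueInterface; SyclDevice derives from it so that
// several device objects can share the same underlying queue and allocator.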
struct SyclDeviceBase {
  // The QueueInterface is not owned: it must outlive every device created
  // from it.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};

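// SyclDevice is the device type passed to Tensor expressions
// (e.g. tensor.device(sycl_device) = ...). Every operation forwards to the
// shared QueueInterface.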
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }

  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }
  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }

  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // There is no reliable way to query this across SYCL devices; the same
    // value as the first-level cache is used.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }

  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }

  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};
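
// A minimal usage sketch (not part of the header): assuming the Eigen Tensor
// module is included and a SYCL device is available, the types above are
// typically wired together as shown below. Names such as `in`, `out` and the
// tensor size are illustrative only.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   Eigen::Tensor<float, 1> in(128), out(128);
//   in.setRandom();
//
//   // Device allocations go through the virtual-pointer mapper.
//   float *d_in = static_cast<float *>(
//       sycl_device.allocate(in.size() * sizeof(float)));
//   float *d_out = static_cast<float *>(
//       sycl_device.allocate(out.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 128);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 128);
//   gpu_out.device(sycl_device) = gpu_in * 2.0f;   // runs on the SYCL device
//
//   sycl_device.memcpyDeviceToHost(out.data(), d_out,
//                                  out.size() * sizeof(float));
//   sycl_device.synchronize();
//   sycl_device.deallocate(d_in);
//   sycl_device.deallocate(d_out);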
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H