File: /include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h

0001 // This file is part of Eigen, a lightweight C++ template library
0002 // for linear algebra.
0003 //
0004 // Mehdi Goli    Codeplay Software Ltd.
0005 // Ralph Potter  Codeplay Software Ltd.
0006 // Luke Iwanski  Codeplay Software Ltd.
0007 // Contact: <eigen@codeplay.com>
0008 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
0009 
0010 //
0011 // This Source Code Form is subject to the terms of the Mozilla
0012 // Public License v. 2.0. If a copy of the MPL was not distributed
0013 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
0014 
0015 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
0016 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
0017 #include <unordered_set>
0018 
0019 namespace Eigen {
0020 
0021 namespace TensorSycl {
0022 namespace internal {
0023 
0024 /// Cache all the device information needed
0025 struct SyclDeviceInfo {
0026   SyclDeviceInfo(cl::sycl::queue queue)
0027       : local_mem_type(
0028             queue.get_device()
0029                 .template get_info<cl::sycl::info::device::local_mem_type>()),
0030         max_work_item_sizes(
0031             queue.get_device()
0032                 .template get_info<
0033                     cl::sycl::info::device::max_work_item_sizes>()),
0034         max_mem_alloc_size(
0035             queue.get_device()
0036                 .template get_info<
0037                     cl::sycl::info::device::max_mem_alloc_size>()),
0038         max_compute_units(queue.get_device()
0039                               .template get_info<
0040                                   cl::sycl::info::device::max_compute_units>()),
0041         max_work_group_size(
0042             queue.get_device()
0043                 .template get_info<
0044                     cl::sycl::info::device::max_work_group_size>()),
0045         local_mem_size(
0046             queue.get_device()
0047                 .template get_info<cl::sycl::info::device::local_mem_size>()),
0048         platform_name(queue.get_device()
0049                           .get_platform()
0050                           .template get_info<cl::sycl::info::platform::name>()),
0051         device_name(queue.get_device()
0052                         .template get_info<cl::sycl::info::device::name>()),
0053         device_vendor(
0054             queue.get_device()
0055                 .template get_info<cl::sycl::info::device::vendor>()) {}
0056 
0057   cl::sycl::info::local_mem_type local_mem_type;
0058   cl::sycl::id<3> max_work_item_sizes;
0059   unsigned long max_mem_alloc_size;
0060   unsigned long max_compute_units;
0061   unsigned long max_work_group_size;
0062   size_t local_mem_size;
0063   std::string platform_name;
0064   std::string device_name;
0065   std::string device_vendor;
0066 };
0067 
0068 }  // end namespace internal
0069 }  // end namespace TensorSycl
0070 
0071 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
0072 // All devices (even an AMD CPU with the Intel OpenCL runtime) that support
0073 // OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend, and
0074 // consequently TensorFlow via the Eigen SYCL backend.
0075 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
0076     -> decltype(cl::sycl::device::get_devices()) {
0077 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
0078   return {cl::sycl::device(cl::sycl::default_selector())};
0079 #else
0080   std::vector<cl::sycl::device> supported_devices;
0081   auto platform_list = cl::sycl::platform::get_platforms();
0082   for (const auto &platform : platform_list) {
0083     auto device_list = platform.get_devices();
0084     auto platform_name =
0085         platform.template get_info<cl::sycl::info::platform::name>();
0086     std::transform(platform_name.begin(), platform_name.end(),
0087                    platform_name.begin(), ::tolower);
0088     for (const auto &device : device_list) {
0089       auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
0090       std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
0091       bool unsupported_condition =
0092           (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
0093            vendor.find("apu") == std::string::npos) ||
0094           (platform_name.find("experimental") != std::string::npos) ||
0095           device.is_host();
0096       if (!unsupported_condition) {
0097         supported_devices.push_back(device);
0098       }
0099     }
0100   }
0101   return supported_devices;
0102 #endif
0103 }
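// A minimal sketch of how the list returned above might be consumed; the
// variable names below are illustrative assumptions, not part of this header:
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   // e.g. pick the first supported device for building a queue
//   const cl::sycl::device &chosen_device = devices.at(0);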
0104 
0105 class QueueInterface {
0106  public:
0107   /// Create the device by using a cl::sycl::selector or a cl::sycl::device.
0108   template <typename DeviceOrSelector>
0109   explicit QueueInterface(
0110       const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
0111       unsigned num_threads = std::thread::hardware_concurrency())
0112       : m_queue(dev_or_sel, handler),
0113 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0114         m_prog(m_queue.get_context(), get_sycl_supported_devices()),
0115 #endif
0116         m_thread_pool(num_threads),
0117         m_device_info(m_queue) {
0118 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0119     m_prog.build_with_kernel_type<DeviceOrSelector>();
0120     auto f = [&](cl::sycl::handler &cgh) {
0121       cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
0122                                         [=]() {});
0123     };
0124     EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
0125 #endif
0126   }
0127 
0128   template <typename DeviceOrSelector>
0129   explicit QueueInterface(
0130       const DeviceOrSelector &dev_or_sel,
0131       unsigned num_threads = std::thread::hardware_concurrency())
0132       : QueueInterface(dev_or_sel,
0133                        [this](cl::sycl::exception_list l) {
0134                          this->exception_caught_ = this->sycl_async_handler(l);
0135                        },
0136                        num_threads) {}
0137 
0138 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0139   EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
0140 #endif
0141 
0142   /// Attach an existing buffer to the pointer map; Eigen will not reuse it.
0143   EIGEN_STRONG_INLINE void *attach_buffer(
0144       cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
0145     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0146     return static_cast<void *>(pMapper.add_pointer(buf));
0147   }
0148 
0149   /// Detach previously attached buffer
0150   EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
0151     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0152     TensorSycl::internal::SYCLfree<false>(p, pMapper);
0153   }
0154 
0155   /// Allocate a device pointer. This pointer is actually an 8-byte host
0156   /// pointer used as a key to access the sycl device buffer. The reason is
0157   /// that a device buffer cannot be used directly as the m_data pointer of an
0158   /// Eigen leafNode expression. So we create a key pointer to be used in
0159   /// Eigen expression construction. When the Eigen expression is converted
0160   /// into its sycl counterpart, this pointer is used as a key in the
0161   /// buffer_map, and we make sure that exactly one buffer is dedicated to
0162   /// this pointer. The device pointer is deleted by calling deallocate().
0163   EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
0164 #if EIGEN_MAX_ALIGN_BYTES > 0
0165     size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
0166     if (align > 0) {
0167       num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
0168     }
0169 #endif
0170     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0171     return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
0172   }
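  // Worked example of the alignment rounding above, assuming
  // EIGEN_MAX_ALIGN_BYTES == 16 (an assumption, the macro is configurable):
  //   allocate(100) computes align = 100 % 16 = 4, pads num_bytes to
  //   100 + (16 - 4) = 112, and creates a 112-byte buffer behind the returned
  //   key pointer, which is later released with deallocate().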
0173 
0174   EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
0175 #if EIGEN_MAX_ALIGN_BYTES > 0
0176     size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
0177     if (align > 0) {
0178       num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
0179     }
0180 #endif
0181     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0182 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
0183     if (scratch_buffers.empty()) {
0184       return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
0185
0186     } else {
0187       for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
0188         auto buff = pMapper.get_buffer(*it);
0189         if (buff.get_size() >= num_bytes) {
0190           auto ptr = *it;
0191           scratch_buffers.erase(it);
0192           return ptr;
0193         } else {
0194           ++it;
0195         }
0196       }
0197       return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
0198     }
0199 #else
0200     return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
0201 #endif
0202   }
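  // A small usage sketch of the scratch-buffer reuse above (hypothetical
  // sizes; assumes EIGEN_SYCL_NO_REUSE_BUFFERS is not defined):
  //   void *a = allocate_temp(1024);  // creates a fresh buffer
  //   deallocate_temp(a);             // keeps the buffer in scratch_buffers
  //   void *b = allocate_temp(512);   // reuses the 1024-byte buffer, b == a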
0203   template <typename data_t>
0204   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
0205       cl::sycl::access::mode::read_write, data_t>
0206   get(data_t *data) const {
0207     return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
0208   }
0209   template <typename data_t>
0210   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
0211       TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
0212                                         data_t>
0213           data) const {
0214     return static_cast<data_t *>(data.get_virtual_pointer());
0215   }
0216 
0217   EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
0218     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0219 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
0220     scratch_buffers.insert(p);
0221 #else
0222     TensorSycl::internal::SYCLfree(p, pMapper);
0223 #endif
0224   }
0225   template <cl::sycl::access::mode AcMd, typename T>
0226   EIGEN_STRONG_INLINE void deallocate_temp(
0227       const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
0228     deallocate_temp(p.get_virtual_pointer());
0229   }
0230 
0231   /// This is used to deallocate the device pointer. p is used as a key inside
0232   /// the map to find the device buffer and delete it.
0233   EIGEN_STRONG_INLINE void deallocate(void *p) const {
0234     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0235     TensorSycl::internal::SYCLfree(p, pMapper);
0236   }
0237 
0238   EIGEN_STRONG_INLINE void deallocate_all() const {
0239     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0240     TensorSycl::internal::SYCLfreeAll(pMapper);
0241 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
0242     scratch_buffers.clear();
0243 #endif
0244   }
0245 
0246   /// memcpyHostToDevice copies data from the host to the device.
0247   /// The destination pointer could be deleted before the copy has happened,
0248   /// which is why a callback function is needed. If none is provided, the
0249   /// function blocks until the copy is complete.
0250   EIGEN_STRONG_INLINE void memcpyHostToDevice(
0251       void *dst, const void *src, size_t n,
0252       std::function<void()> callback) const {
0253     static const auto write_mode = cl::sycl::access::mode::discard_write;
0254     static const auto global_access = cl::sycl::access::target::global_buffer;
0255     typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
0256         write_accessor;
0257     if (n == 0) {
0258       if (callback) callback();
0259       return;
0260     }
0261     n /= sizeof(buffer_scalar_t);
0262     auto f = [&](cl::sycl::handler &cgh) {
0263       write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
0264       buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
0265       auto non_deleter = [](buffer_scalar_t const *) {};
0266       std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
0267       cgh.copy(s_ptr, dst_acc);
0268     };
0269     cl::sycl::event e;
0270     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
0271     synchronize_and_callback(e, callback);
0272   }
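  // A minimal sketch of an asynchronous host-to-device copy through this
  // interface; the host vector, size and callback body are assumptions:
  //   std::vector<float> host(256, 1.0f);
  //   void *device_ptr = allocate(host.size() * sizeof(float));
  //   memcpyHostToDevice(device_ptr, host.data(),
  //                      host.size() * sizeof(float),
  //                      [] { /* copy finished, host data may be released */ });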
0273 
0274   /// memcpyDeviceToHost copies data from the device to the host.
0275   /// The source pointer could be deleted before the copy has happened,
0276   /// which is why a callback function is needed. If none is provided, the
0277   /// function blocks until the copy is complete.
0278   EIGEN_STRONG_INLINE void memcpyDeviceToHost(
0279       void *dst, const void *src, size_t n,
0280       std::function<void()> callback) const {
0281     static const auto read_mode = cl::sycl::access::mode::read;
0282     static const auto global_access = cl::sycl::access::target::global_buffer;
0283     typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
0284         read_accessor;
0285     if (n == 0) {
0286       if (callback) callback();
0287       return;
0288     }
0289     n /= sizeof(buffer_scalar_t);
0290     auto f = [&](cl::sycl::handler &cgh) {
0291       read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
0292       buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
0293       auto non_deleter = [](buffer_scalar_t *) {};
0294       std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
0295       cgh.copy(src_acc, s_ptr);
0296     };
0297     cl::sycl::event e;
0298     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
0299     synchronize_and_callback(e, callback);
0300   }
0301 
0302   /// The memcpy function.
0303   /// No callback is required here as both arguments are on the device
0304   /// and SYCL can handle the dependency.
0305   EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
0306     static const auto read_mode = cl::sycl::access::mode::read;
0307     static const auto write_mode = cl::sycl::access::mode::discard_write;
0308     if (n == 0) {
0309       return;
0310     }
0311     n /= sizeof(buffer_scalar_t);
0312     auto f = [&](cl::sycl::handler &cgh) {
0313       auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
0314       auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
0315       cgh.copy(src_acc, dst_acc);
0316     };
0317     cl::sycl::event e;
0318     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
0319     async_synchronize(e);
0320   }
0321 
0322   /// the memset function.
0323   /// No callback is required here as both arguments are on the device
0324   /// and SYCL can handle the dependency.
0325   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
0326     static const auto write_mode = cl::sycl::access::mode::discard_write;
0327     if (n == 0) {
0328       return;
0329     }
0330     n /= sizeof(buffer_scalar_t);
0331     auto f = [&](cl::sycl::handler &cgh) {
0332       auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
0333       // The cast to uint8_t is here to match the behaviour of the standard
0334       // memset. The cast to buffer_scalar_t is needed to match the type of the
0335       // accessor (in case buffer_scalar_t is not uint8_t)
0336       cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
0337     };
0338     cl::sycl::event e;
0339     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
0340     async_synchronize(e);
0341   }
0342 
0343   /// Get a range accessor to the virtual pointer's device memory. This range
0344   /// accessor will allow access to the memory from the pointer to the end of
0345   /// the buffer.
0346   ///
0347   /// NOTE: Inside a kernel the range accessor will always be indexed from the
0348   /// start of the buffer, so the offset in the accessor is only used by
0349   /// methods like handler::copy and will not be available inside a kernel.
0350   template <cl::sycl::access::mode AcMd, typename T>
0351   EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
0352   get_range_accessor(const void *ptr) const {
0353     static const auto global_access = cl::sycl::access::target::global_buffer;
0354     static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
0355     typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
0356     typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
0357 
0358     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0359 
0360     auto original_buffer = pMapper.get_buffer(ptr);
0361     const ptrdiff_t offset = pMapper.get_offset(ptr);
0362     const ptrdiff_t typed_offset = offset / sizeof(T);
0363     eigen_assert(typed_offset >= 0);
0364     const auto typed_size = original_buffer.get_size() / sizeof(T);
0365     auto buffer = original_buffer.template reinterpret<
0366         typename Eigen::internal::remove_const<T>::type>(
0367         cl::sycl::range<1>(typed_size));
0368     const ptrdiff_t size = buffer.get_count() - typed_offset;
0369     eigen_assert(size >= 0);
0370     typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
0371                                1, AcMd, global_access, is_place_holder>
0372         placeholder_accessor_t;
0373     const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
0374     return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
0375                                            cl::sycl::id<1>(typed_offset)),
0376                     static_cast<size_t>(typed_offset),
0377                     reinterpret_cast<std::intptr_t>(start_ptr));
0378   }
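  // Worked example of the offset arithmetic above with hypothetical values:
  // if ptr points 64 bytes into a 256-byte buffer and T is float, then
  // offset = 64, typed_offset = 16, typed_size = 64, and the returned range
  // accessor covers the remaining 48 floats starting at element 16.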
0379 
0380   /// Get a range accessor to the virtual pointer's device memory with a
0381   /// specified size.
0382   template <cl::sycl::access::mode AcMd, typename Index>
0383   EIGEN_STRONG_INLINE cl::sycl::accessor<
0384       buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
0385   get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
0386                      const Index n_bytes) const {
0387     static const auto global_access = cl::sycl::access::target::global_buffer;
0388     eigen_assert(n_bytes >= 0);
0389     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0390     auto buffer = pMapper.get_buffer(ptr);
0391     const ptrdiff_t offset = pMapper.get_offset(ptr);
0392     eigen_assert(offset >= 0);
0393     eigen_assert(offset + n_bytes <= buffer.get_size());
0394     return buffer.template get_access<AcMd, global_access>(
0395         cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
0396   }
0397 
0398   /// Creation of sycl accessor for a buffer. This function first tries to find
0399   /// the buffer in the buffer_map. If found it gets the accessor from it, if
0400   /// not, the function then adds an entry by creating a sycl buffer for that
0401   /// particular pointer.
0402   template <cl::sycl::access::mode AcMd>
0403   EIGEN_STRONG_INLINE cl::sycl::accessor<
0404       buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
0405   get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
0406     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0407     return pMapper.get_buffer(ptr)
0408         .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
0409             cgh);
0410   }
0411 
0412   EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
0413       const void *ptr) const {
0414     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0415     return pMapper.get_buffer(ptr);
0416   }
0417 
0418   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
0419     std::lock_guard<std::mutex> lock(pmapper_mutex_);
0420     return pMapper.get_offset(ptr);
0421   }
0422 
0423   template <typename OutScalar, typename sycl_kernel, typename Lhs,
0424             typename Rhs, typename OutPtr, typename Range, typename Index,
0425             typename... T>
0426   EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
0427                                                   const Rhs &rhs, OutPtr outptr,
0428                                                   Range thread_range,
0429                                                   Index scratchSize,
0430                                                   T... var) const {
0431     auto kernel_functor = [=](cl::sycl::handler &cgh) {
0432       // binding the placeholder accessors to a commandgroup handler
0433       lhs.bind(cgh);
0434       rhs.bind(cgh);
0435       outptr.bind(cgh);
0436       typedef cl::sycl::accessor<OutScalar, 1,
0437                                  cl::sycl::access::mode::read_write,
0438                                  cl::sycl::access::target::local>
0439           LocalAccessor;
0440 
0441       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
0442       cgh.parallel_for(
0443 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0444           program().template get_kernel<sycl_kernel>(),
0445 #endif
0446           thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
0447     };
0448     cl::sycl::event e;
0449     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
0450     async_synchronize(e);
0451   }
0452 
0453   template <typename OutScalar, typename sycl_kernel, typename InPtr,
0454             typename OutPtr, typename Range, typename Index, typename... T>
0455   EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
0456                                                  OutPtr &outptr,
0457                                                  Range thread_range,
0458                                                  Index scratchSize,
0459                                                  T... var) const {
0460     auto kernel_functor = [=](cl::sycl::handler &cgh) {
0461       // binding the placeholder accessors to a commandgroup handler
0462       inptr.bind(cgh);
0463       outptr.bind(cgh);
0464       typedef cl::sycl::accessor<OutScalar, 1,
0465                                  cl::sycl::access::mode::read_write,
0466                                  cl::sycl::access::target::local>
0467           LocalAccessor;
0468 
0469       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
0470       cgh.parallel_for(
0471 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0472           program().template get_kernel<sycl_kernel>(),
0473 #endif
0474           thread_range, sycl_kernel(scratch, inptr, outptr, var...));
0475     };
0476     cl::sycl::event e;
0477     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
0478     async_synchronize(e);
0479   }
0480 
0481   template <typename OutScalar, typename sycl_kernel, typename InPtr,
0482             typename Range, typename Index, typename... T>
0483   EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
0484                                                    Range thread_range,
0485                                                    Index scratchSize,
0486                                                    T... var) const {
0487     auto kernel_functor = [=](cl::sycl::handler &cgh) {
0488       // binding the placeholder accessors to a commandgroup handler
0489       inptr.bind(cgh);
0490       typedef cl::sycl::accessor<OutScalar, 1,
0491                                  cl::sycl::access::mode::read_write,
0492                                  cl::sycl::access::target::local>
0493           LocalAccessor;
0494 
0495       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
0496       cgh.parallel_for(
0497 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0498           program().template get_kernel<sycl_kernel>(),
0499 #endif
0500           thread_range, sycl_kernel(scratch, inptr, var...));
0501     };
0502     cl::sycl::event e;
0503     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
0504     async_synchronize(e);
0505   }
0506 
0507 
0508   EIGEN_STRONG_INLINE void synchronize() const {
0509 #ifdef EIGEN_EXCEPTIONS
0510     m_queue.wait_and_throw();
0511 #else
0512     m_queue.wait();
0513 #endif
0514   }
0515 
0516 
0517   EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
0518     set_latest_event(e);
0519 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
0520     synchronize();
0521 #endif
0522   }
0523 
0524   template <typename Index>
0525   EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
0526                                               Index &rng, Index &GRange) const {
0527     tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
0528     tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
0529                                            EIGEN_SYCL_LOCAL_THREAD_DIM1),
0530                         static_cast<Index>(tileSize));
0531     rng = n;
0532     if (rng == 0) rng = static_cast<Index>(1);
0533     GRange = rng;
0534     if (tileSize > GRange)
0535       tileSize = GRange;
0536     else if (GRange > tileSize) {
0537       Index xMode = static_cast<Index>(GRange % tileSize);
0538       if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
0539     }
0540   }
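  // Worked example, assuming the nearest power-of-two work-group size above
  // resolves to 256 on the device (an assumption):
  //   parallel_for_setup(1000, tileSize, rng, GRange) yields tileSize = 256,
  //   rng = 1000 and GRange = 1024, i.e. the global range is rounded up to a
  //   multiple of the work-group size.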
0541 
0542   /// This is used to prepare the number of threads and also the number of
0543   /// threads per block for sycl kernels
0544   template <typename Index>
0545   EIGEN_STRONG_INLINE void parallel_for_setup(
0546       const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
0547       cl::sycl::range<2> &local_range) const {
0548     std::array<Index, 2> input_range = input_dim;
0549     Index max_workgroup_Size =
0550         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
0551     max_workgroup_Size =
0552         std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
0553                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
0554                  static_cast<Index>(max_workgroup_Size));
0555     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
0556     local_range[1] =
0557         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
0558     input_range[1] = input_dim[1];
0559     if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
0560     global_range[1] = input_range[1];
0561     if (local_range[1] > global_range[1])
0562       local_range[1] = global_range[1];
0563     else if (global_range[1] > local_range[1]) {
0564       Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
0565       if (xMode != 0)
0566         global_range[1] += static_cast<Index>(local_range[1] - xMode);
0567     }
0568     local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
0569     input_range[0] = input_dim[0];
0570     if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
0571     global_range[0] = input_range[0];
0572     if (local_range[0] > global_range[0])
0573       local_range[0] = global_range[0];
0574     else if (global_range[0] > local_range[0]) {
0575       Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
0576       if (xMode != 0)
0577         global_range[0] += static_cast<Index>(local_range[0] - xMode);
0578     }
0579   }
0580 
0581   /// This is used to prepare the number of threads and also the number of
0582   /// threads per block for sycl kernels
0583   template <typename Index>
0584   EIGEN_STRONG_INLINE void parallel_for_setup(
0585       const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
0586       cl::sycl::range<3> &local_range) const {
0587     std::array<Index, 3> input_range = input_dim;
0588     Index max_workgroup_Size =
0589         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
0590     max_workgroup_Size =
0591         std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
0592                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
0593                  static_cast<Index>(max_workgroup_Size));
0594     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
0595     local_range[2] =
0596         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
0597     input_range[2] = input_dim[2];
0598     if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
0599     global_range[2] = input_range[2];
0600     if (local_range[2] > global_range[2])
0601       local_range[2] = global_range[2];
0602     else if (global_range[2] > local_range[2]) {
0603       Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
0604       if (xMode != 0)
0605         global_range[2] += static_cast<Index>(local_range[2] - xMode);
0606     }
0607     pow_of_2 = static_cast<Index>(
0608         std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
0609     local_range[1] =
0610         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
0611     input_range[1] = input_dim[1];
0612     if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
0613     global_range[1] = input_range[1];
0614     if (local_range[1] > global_range[1])
0615       local_range[1] = global_range[1];
0616     else if (global_range[1] > local_range[1]) {
0617       Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
0618       if (xMode != 0)
0619         global_range[1] += static_cast<Index>(local_range[1] - xMode);
0620     }
0621     local_range[0] = static_cast<Index>(max_workgroup_Size /
0622                                         (local_range[1] * local_range[2]));
0623     input_range[0] = input_dim[0];
0624     if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
0625     global_range[0] = input_range[0];
0626     if (local_range[0] > global_range[0])
0627       local_range[0] = global_range[0];
0628     else if (global_range[0] > local_range[0]) {
0629       Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
0630       if (xMode != 0)
0631         global_range[0] += static_cast<Index>(local_range[0] - xMode);
0632     }
0633   }
0634 
0635   EIGEN_STRONG_INLINE bool has_local_memory() const {
0636 #if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
0637     return false;
0638 #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
0639     return true;
0640 #else
0641     return m_device_info.local_mem_type ==
0642            cl::sycl::info::local_mem_type::local;
0643 #endif
0644   }
0645 
0646   EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
0647     return m_device_info.max_mem_alloc_size;
0648   }
0649 
0650   EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
0651     return m_device_info.max_compute_units;
0652   }
0653 
0654   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
0655     return m_device_info.max_work_group_size;
0656   }
0657 
0658   EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
0659     return m_device_info.max_work_item_sizes;
0660   }
0661 
0662   /// Not needed for sycl; it should act the same as the CPU version.
0663   EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
0664 
0665   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
0666     // OpenCL does not have such a concept.
0667     return 2;
0668   }
0669 
0670   EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
0671     return m_device_info.local_mem_size;
0672   }
0673 
0674   // This function returns the nearest power-of-two work-group size that is
0675   // <= the maximum device work-group size.
0676   EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
0677     return getPowerOfTwo(m_device_info.max_work_group_size, false);
0678   }
0679 
0680   EIGEN_STRONG_INLINE std::string getPlatformName() const {
0681     return m_device_info.platform_name;
0682   }
0683 
0684   EIGEN_STRONG_INLINE std::string getDeviceName() const {
0685     return m_device_info.device_name;
0686   }
0687 
0688   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
0689     return m_device_info.device_vendor;
0690   }
0691 
0692   // This function returns the nearest power of 2:
0693   // if roundUp is true it returns a result >= wGSize,
0694   // otherwise it returns a result <= wGSize.
0695   EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
0696     if (roundUp) --wGSize;
0697     wGSize |= (wGSize >> 1);
0698     wGSize |= (wGSize >> 2);
0699     wGSize |= (wGSize >> 4);
0700     wGSize |= (wGSize >> 8);
0701     wGSize |= (wGSize >> 16);
0702 #if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
0703     wGSize |= (wGSize >> 32);
0704 #endif
0705     return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
0706   }
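  // Worked example of the bit-smearing above: getPowerOfTwo(600, false)
  // smears the bits of 600 to 1023 and returns 1023 - 511 = 512, while
  // getPowerOfTwo(600, true) decrements to 599, smears to 1023 and returns
  // 1024 after the final increment.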
0707 
0708   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
0709 
0710   // This function checks if the runtime recorded an error for the
0711   // underlying stream device.
0712   EIGEN_STRONG_INLINE bool ok() const {
0713     if (!exception_caught_) {
0714       synchronize();
0715     }
0716     return !exception_caught_;
0717   }
0718 
0719   EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
0720 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
0721     std::lock_guard<std::mutex> lock(event_mutex_);
0722     return latest_events_[std::this_thread::get_id()];
0723 #else
0724     eigen_assert(false);
0725     return cl::sycl::event();
0726 #endif
0727   }
0728 
0729   // destructor
0730   ~QueueInterface() {
0731     pMapper.clear();
0732 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
0733     scratch_buffers.clear();
0734 #endif
0735   }
0736 
0737  protected:
0738   EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
0739 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
0740     std::lock_guard<std::mutex> lock(event_mutex_);
0741     latest_events_[std::this_thread::get_id()] = e;
0742 #else
0743     EIGEN_UNUSED_VARIABLE(e);
0744 #endif
0745   }
0746 
0747   void synchronize_and_callback(cl::sycl::event e,
0748                                 const std::function<void()> &callback) const {
0749     set_latest_event(e);
0750     if (callback) {
0751       auto callback_ = [=]() {
0752 #ifdef EIGEN_EXCEPTIONS
0753         cl::sycl::event(e).wait_and_throw();
0754 #else
0755         cl::sycl::event(e).wait();
0756 #endif
0757         callback();
0758       };
0759       m_thread_pool.Schedule(std::move(callback_));
0760     } else {
0761 #ifdef EIGEN_EXCEPTIONS
0762       m_queue.wait_and_throw();
0763 #else
0764       m_queue.wait();
0765 #endif
0766     }
0767   }
0768 
0769   bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
0770     bool exception_caught = false;
0771     for (const auto &e : exceptions) {
0772       if (e) {
0773         exception_caught = true;
0774         EIGEN_THROW_X(e);
0775       }
0776     }
0777     return exception_caught;
0778   }
0779 
0780   /// class members:
0781   bool exception_caught_ = false;
0782 
0783   mutable std::mutex pmapper_mutex_;
0784 
0785 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
0786   mutable std::mutex event_mutex_;
0787   mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
0788 #endif
0789 
0790   /// std::map is the container used to make sure that only one buffer is
0791   /// created per pointer. The lifespan of the buffer now depends on the
0792   /// lifespan of SyclDevice. If a non-read-only pointer needs to be accessed
0793   /// on the host, it must be deallocated manually.
0794   mutable TensorSycl::internal::PointerMapper pMapper;
0795 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
0796   mutable std::unordered_set<void *> scratch_buffers;
0797 #endif
0798   /// sycl queue
0799   mutable cl::sycl::queue m_queue;
0800 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0801   mutable cl::sycl::program m_prog;
0802 #endif
0803 
0804   /// The thread pool is used to wait on events and call callbacks
0805   /// asynchronously
0806   mutable Eigen::ThreadPool m_thread_pool;
0807 
0808   const TensorSycl::internal::SyclDeviceInfo m_device_info;
0809 };
0810 
0811 struct SyclDeviceBase {
0812   /// QueueInterface is not owned. It is the caller's responsibility to
0813   /// destroy it.
0814   const QueueInterface *m_queue_stream;
0815   explicit SyclDeviceBase(const QueueInterface *queue_stream)
0816       : m_queue_stream(queue_stream) {}
0817   EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
0818     return m_queue_stream;
0819   }
0820 };
0821 
0822 // Here is a sycl device struct which accepts the sycl queue interface
0823 // as an input.
0824 struct SyclDevice : public SyclDeviceBase {
0825   explicit SyclDevice(const QueueInterface *queue_stream)
0826       : SyclDeviceBase(queue_stream) {}
0827 
0828   // this is the accessor used to construct the evaluator
0829   template <cl::sycl::access::mode AcMd, typename T>
0830   EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
0831   get_range_accessor(const void *ptr) const {
0832     return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
0833   }
0834 
0835   // get sycl accessor
0836   template <cl::sycl::access::mode AcMd>
0837   EIGEN_STRONG_INLINE cl::sycl::accessor<
0838       buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
0839   get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
0840     return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
0841   }
0842 
0843   /// Accessing the created sycl device buffer for the device pointer
0844   EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
0845       const void *ptr) const {
0846     return queue_stream()->get_sycl_buffer(ptr);
0847   }
0848 
0849   /// This is used to prepare the number of threads and also the number of
0850   /// threads per block for sycl kernels
0851   template <typename Index>
0852   EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
0853                                               Index &rng, Index &GRange) const {
0854     queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
0855   }
0856 
0857   /// This is used to prepare the number of threads and also the number of
0858   /// threads per block for sycl kernels
0859   template <typename Index>
0860   EIGEN_STRONG_INLINE void parallel_for_setup(
0861       const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
0862       cl::sycl::range<2> &local_range) const {
0863     queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
0864   }
0865 
0866   /// This is used to prepare the number of threads and also the number of
0867   /// threads per block for sycl kernels
0868   template <typename Index>
0869   EIGEN_STRONG_INLINE void parallel_for_setup(
0870       const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
0871       cl::sycl::range<3> &local_range) const {
0872     queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
0873   }
0874 
0875   /// allocate device memory
0876   EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
0877     return queue_stream()->allocate(num_bytes);
0878   }
0879 
0880   EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
0881     return queue_stream()->allocate_temp(num_bytes);
0882   }
0883 
0884   /// deallocate device memory
0885   EIGEN_STRONG_INLINE void deallocate(void *p) const {
0886     queue_stream()->deallocate(p);
0887   }
0888 
0889   EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
0890     queue_stream()->deallocate_temp(buffer);
0891   }
0892   template <cl::sycl::access::mode AcMd, typename T>
0893   EIGEN_STRONG_INLINE void deallocate_temp(
0894       const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
0895     queue_stream()->deallocate_temp(buffer);
0896   }
0897   EIGEN_STRONG_INLINE void deallocate_all() const {
0898     queue_stream()->deallocate_all();
0899   }
0900 
0901   template <typename data_t>
0902   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
0903       cl::sycl::access::mode::read_write, data_t>
0904   get(data_t *data) const {
0905     return queue_stream()->get(data);
0906   }
0907   template <typename data_t>
0908   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
0909       TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
0910                                         data_t>
0911           data) const {
0912     return queue_stream()->get(data);
0913   }
0914 
0915   /// attach existing buffer
0916   EIGEN_STRONG_INLINE void *attach_buffer(
0917       cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
0918     return queue_stream()->attach_buffer(buf);
0919   }
0920   /// detach buffer
0921   EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
0922     queue_stream()->detach_buffer(p);
0923   }
0924   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
0925     return queue_stream()->get_offset(ptr);
0926   }
0927 
0928   // some runtime conditions that can be applied here
0929   EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
0930 
0931   /// memcpyHostToDevice
0932   template <typename Index>
0933   EIGEN_STRONG_INLINE void memcpyHostToDevice(
0934       Index *dst, const Index *src, size_t n,
0935       std::function<void()> callback = {}) const {
0936     queue_stream()->memcpyHostToDevice(dst, src, n, callback);
0937   }
0938   /// memcpyDeviceToHost
0939   template <typename Index>
0940   EIGEN_STRONG_INLINE void memcpyDeviceToHost(
0941       void *dst, const Index *src, size_t n,
0942       std::function<void()> callback = {}) const {
0943     queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
0944   }
0945   /// the memcpy function
0946   template <typename Index>
0947   EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
0948     queue_stream()->memcpy(dst, src, n);
0949   }
0950   /// the memset function
0951   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
0952     queue_stream()->memset(data, c, n);
0953   }
0954   /// returning the sycl queue
0955   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
0956     return queue_stream()->sycl_queue();
0957   }
0958 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
0959   EIGEN_STRONG_INLINE cl::sycl::program &program() const {
0960     return queue_stream()->program();
0961   }
0962 #endif
0963 
0964   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
0965 
0966   EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
0967     // We won't try to take advantage of the l2 cache for the time being, and
0968     // there is no l3 cache on sycl devices.
0969     return firstLevelCacheSize();
0970   }
0971   EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
0972     return queue_stream()->getNumSyclMultiProcessors();
0973   }
0974   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
0975     return queue_stream()->maxSyclThreadsPerBlock();
0976   }
0977   EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
0978     return queue_stream()->maxWorkItemSizes();
0979   }
0980   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
0981     // OpenCL does not have such a concept.
0982     return queue_stream()->maxSyclThreadsPerMultiProcessor();
0983   }
0984   EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
0985     return queue_stream()->sharedMemPerBlock();
0986   }
0987   EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
0988     return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
0989   }
0990 
0991   EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
0992     return queue_stream()->getPowerOfTwo(val, roundUp);
0993   }
0994   /// Not needed for sycl; it should act the same as the CPU version.
0995   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
0996     return queue_stream()->majorDeviceVersion();
0997   }
0998 
0999   EIGEN_STRONG_INLINE void synchronize() const {
1000     queue_stream()->synchronize();
1001   }
1002   EIGEN_STRONG_INLINE void async_synchronize(
1003       cl::sycl::event e = cl::sycl::event()) const {
1004     queue_stream()->async_synchronize(e);
1005   }
1006   EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
1007     return queue_stream()->get_latest_event();
1008   }
1009 
1010   // This function checks if the runtime recorded an error for the
1011   // underlying stream device.
1012   EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
1013 
1014   EIGEN_STRONG_INLINE bool has_local_memory() const {
1015     return queue_stream()->has_local_memory();
1016   }
1017   EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
1018     return queue_stream()->max_buffer_size();
1019   }
1020   EIGEN_STRONG_INLINE std::string getPlatformName() const {
1021     return queue_stream()->getPlatformName();
1022   }
1023   EIGEN_STRONG_INLINE std::string getDeviceName() const {
1024     return queue_stream()->getDeviceName();
1025   }
1026   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
1027     return queue_stream()->getDeviceVendor();
1028   }
1029   template <typename OutScalar, typename KernelType, typename... T>
1030   EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
1031     queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
1032         var...);
1033   }
1034   template <typename OutScalar, typename KernelType, typename... T>
1035   EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
1036     queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
1037         var...);
1038   }
1039 
1040   template <typename OutScalar, typename KernelType, typename... T>
1041   EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
1042     queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
1043         var...);
1044   }
1045 };
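// A minimal end-to-end sketch of wiring these classes together; the selector,
// buffer size and call sequence below are illustrative assumptions:
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector()};
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   void *d_in  = sycl_device.allocate(128 * sizeof(float));
//   void *d_out = sycl_device.allocate(128 * sizeof(float));
//   sycl_device.memset(d_out, 0, 128 * sizeof(float));
//   sycl_device.memcpy(d_out, d_in, 128 * sizeof(float));
//   sycl_device.synchronize();
//   sycl_device.deallocate(d_in);
//   sycl_device.deallocate(d_out);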
1046 }  // end namespace Eigen
1047 
1048 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H