// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H

// Standard library dependencies used below. (In the Eigen tree these are
// normally pulled in by the umbrella ThreadPool header, which also provides
// Eigen::MaxSizeVector.)
#include <atomic>
#include <functional>
#include <mutex>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <utility>

#ifdef EIGEN_AVOID_THREAD_LOCAL

#ifdef EIGEN_THREAD_LOCAL
#undef EIGEN_THREAD_LOCAL
#endif

#else

#if EIGEN_MAX_CPP_VER >= 11 &&                         \
    ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
     __has_feature(cxx_thread_local)                || \
     (EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_THREAD_LOCAL static thread_local
#endif
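
// A minimal illustrative sketch (not part of the original header): when
// EIGEN_THREAD_LOCAL is defined, call sites can declare a per-thread static
// directly. `PerThreadState` below is a hypothetical type:
//
//   #ifdef EIGEN_THREAD_LOCAL
//   EIGEN_THREAD_LOCAL PerThreadState state;
//   // ...expands to: static thread_local PerThreadState state;
//   #endif
//
// When the macro stays undefined (see the platform checks below), callers
// must instead use a container such as the Eigen::ThreadLocal class defined
// later in this file.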

// Disable TLS for Apple and Android builds with older toolchains.
#if defined(__APPLE__)
// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
// __IPHONE_8_0.
#include <Availability.h>
#include <TargetConditionals.h>
#endif
// Checks whether C++11's `thread_local` storage duration specifier is
// supported.
#if defined(__apple_build_version__) &&     \
    ((__apple_build_version__ < 8000042) || \
     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
// Notes: Xcode's clang did not support `thread_local` until version
// 8, and even then not for all iOS < 9.0.
#undef EIGEN_THREAD_LOCAL

#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
// There are platforms for which TLS should not be used even though the
// compiler makes it seem like it's supported (Android NDK < r12b, for
// example). This is primarily because of linker problems and toolchain
// misconfiguration: TLS isn't supported until NDK r12b per
// https://developer.android.com/ndk/downloads/revision_history.html
// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
// <android/ndk-version.h>. For NDK < r16, users should define these macros,
// e.g. `-D__NDK_MAJOR__=11 -D__NDK_MINOR__=0` for NDK r11.
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif  // __has_include(<android/ndk-version.h>)
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
    defined(__NDK_MINOR__) &&                                               \
    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef EIGEN_THREAD_LOCAL
#endif
#endif  // defined(__ANDROID__) && defined(__clang__)

#endif  // EIGEN_AVOID_THREAD_LOCAL

namespace Eigen {

namespace internal {

// Default no-op `initialize` callable: leaves the default-constructed T
// untouched.
template <typename T>
struct ThreadLocalNoOpInitialize {
  void operator()(T&) const {}
};

// Default no-op `release` callable: does nothing before ~T() runs.
template <typename T>
struct ThreadLocalNoOpRelease {
  void operator()(T&) const {}
};

}  // namespace internal

// Thread-local container for elements of type T that does not use
// thread-local storage. As long as the number of unique threads accessing
// this storage is smaller than `capacity_`, it is lock-free and wait-free.
// Otherwise it will use a mutex for synchronization.
//
// Type `T` has to be default constructible, and by default each thread will
// get a default-constructed value. It is possible to specify a custom
// `initialize` callable that will be called lazily from each thread
// accessing this object, and will be passed a default-initialized object of
// type `T`. It is also possible to pass a custom `release` callable, which
// will be invoked before ~T() is called.
//
// Example:
//
//   struct Counter {
//     int value = 0;
//   };
//
//   Eigen::ThreadLocal<Counter> counter(10);
//
//   // Each thread will have access to its own counter object.
//   Counter& cnt = counter.local();
//   cnt.value++;
//
// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
// std::this_thread::get_id() to identify threads. This value is not
// guaranteed to be unique except for the life of the thread. A newly created
// thread may get an OS-specific ID equal to that of an already destroyed
// thread.
//
// Somewhat similar to TBB thread local storage, with similar restrictions:
// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
//
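// A hedged usage sketch (not from the original header) showing the custom
// `initialize` and `release` hooks. `LogStream`, `init`, and `release`
// below are illustrative assumptions, not Eigen APIs:
//
//   struct LogStream { std::FILE* file = nullptr; };
//
//   auto init = [](LogStream& s) { s.file = std::tmpfile(); };
//   auto release = [](LogStream& s) { if (s.file) std::fclose(s.file); };
//
//   // Up to 16 threads get lock-free access. `init` runs lazily on each
//   // thread's first call to `local()`; `release` runs in ~ThreadLocal()
//   // before the stored LogStream is destroyed.
//   Eigen::ThreadLocal<LogStream, decltype(init), decltype(release)>
//       streams(16, init, release);
//
//   LogStream& stream = streams.local();
//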
template <typename T,
          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
          typename Release = internal::ThreadLocalNoOpRelease<T>>
class ThreadLocal {
  // We preallocate default-constructed elements in MaxSizeVector.
  static_assert(std::is_default_constructible<T>::value,
                "ThreadLocal data type must be default constructible");

 public:
  // Creates storage for `capacity` unique threads, with default-constructed
  // values and no release hook.
  explicit ThreadLocal(int capacity)
      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
                    internal::ThreadLocalNoOpRelease<T>()) {}

  ThreadLocal(int capacity, Initialize initialize)
      : ThreadLocal(capacity, std::move(initialize),
                    internal::ThreadLocalNoOpRelease<T>()) {}

  ThreadLocal(int capacity, Initialize initialize, Release release)
      : initialize_(std::move(initialize)),
        release_(std::move(release)),
        capacity_(capacity),
        data_(capacity_),
        ptr_(capacity_),
        filled_records_(0) {
    eigen_assert(capacity_ >= 0);
    data_.resize(capacity_);
    // Start with an empty (all-nullptr) lookup table.
    for (int i = 0; i < capacity_; ++i) {
      ptr_.emplace_back(nullptr);
    }
  }

  T& local() {
    std::thread::id this_thread = std::this_thread::get_id();
    if (capacity_ == 0) return SpilledLocal(this_thread);

    std::size_t h = std::hash<std::thread::id>()(this_thread);
    const int start_idx = h % capacity_;

    // NOTE: From the definition of `std::this_thread::get_id()` it is
    // guaranteed that we can never have concurrent insertions with the same
    // key in our hash-map-like data structure. If we didn't find an element
    // during the initial traversal, it's guaranteed that no one else could
    // have inserted it while we are in this function. This allows us to
    // massively simplify our lock-free insert-only hash map.

    // Check if we already have an element for `this_thread`.
    int idx = start_idx;
    while (ptr_[idx].load() != nullptr) {
      ThreadIdAndValue& record = *(ptr_[idx].load());
      if (record.thread_id == this_thread) return record.value;

      idx += 1;
      if (idx >= capacity_) idx -= capacity_;
      if (idx == start_idx) break;
    }

    // If we are here, it means that we found an insertion point in the
    // lookup table at `idx`, or we did a full traversal and the table is
    // full.

    // If the lock-free storage is full, fall back on the mutex-guarded map.
    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);

    // We double-check that we still have space to insert an element into the
    // lock-free storage. If the old value of `filled_records_` is at least
    // the capacity, some other thread added an element while we were
    // traversing the lookup table.
    int insertion_index =
        filled_records_.fetch_add(1, std::memory_order_relaxed);
    if (insertion_index >= capacity_) return SpilledLocal(this_thread);

    // At this point it is guaranteed that we can access
    // data_[insertion_index] without a data race.
    data_[insertion_index].thread_id = this_thread;
    initialize_(data_[insertion_index].value);

    // That's the pointer we'll put into the lookup table.
    ThreadIdAndValue* inserted = &data_[insertion_index];

    // We'll use a nullptr ThreadIdAndValue pointer as the expected value in
    // the compare-and-swap loop.
    ThreadIdAndValue* empty = nullptr;

    // Now we have to find an insertion point into the lookup table. We start
    // from the `idx` that was identified as an insertion point above; it's
    // guaranteed that there is an empty record somewhere in the lookup table
    // (because we created a record in `data_`).
    const int insertion_idx = idx;

    do {
      // Always start the search from the original insertion candidate.
      idx = insertion_idx;
      while (ptr_[idx].load() != nullptr) {
        idx += 1;
        if (idx >= capacity_) idx -= capacity_;
        // If we did a full loop, it means that we don't have any free
        // entries in the lookup table, and that something is terribly wrong.
        eigen_assert(idx != insertion_idx);
      }
      // The atomic CAS of the pointer guarantees that any other thread that
      // follows this pointer will see all the mutations in `data_`.
    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));

    return inserted->value;
  }
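
  // Illustrative walk-through of the probing scheme above (not part of the
  // original source; the hash values are hypothetical): with capacity_ == 4,
  // suppose thread A hashes to start_idx 1 and CASes its record into
  // ptr_[1]. Thread B also hashes to 1, finds ptr_[1] occupied by A, probes
  // linearly, and CASes its record into ptr_[2]. A later `local()` call from
  // B starts at index 1, skips A's record, and finds its own at index 2.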

  // WARNING: It is not thread-safe to call this concurrently with `local()`.
  void ForEach(std::function<void(std::thread::id, T&)> f) {
    // Reading directly from `data_` is unsafe, because only the CAS to the
    // record in `ptr_` makes all changes visible to other threads.
    for (auto& ptr : ptr_) {
      ThreadIdAndValue* record = ptr.load();
      if (record == nullptr) continue;
      f(record->thread_id, record->value);
    }

    // We did not spill into the map-based storage.
    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;

    // Adds a happens-before edge from the last call to SpilledLocal().
    std::unique_lock<std::mutex> lock(mu_);
    for (auto& kv : per_thread_map_) {
      f(kv.first, kv.second);
    }
  }
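
  // A minimal usage sketch for `ForEach` (illustrative, not from the
  // original source; `counter` and `Counter` are the hypothetical objects
  // from the class-level example above). Run it only after all worker
  // threads have stopped calling `local()`:
  //
  //   int total = 0;
  //   counter.ForEach(
  //       [&total](std::thread::id, Counter& cnt) { total += cnt.value; });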

  // WARNING: It is not thread-safe to call this concurrently with `local()`.
  ~ThreadLocal() {
    // Reading directly from `data_` is unsafe, because only the CAS to the
    // record in `ptr_` makes all changes visible to other threads.
    for (auto& ptr : ptr_) {
      ThreadIdAndValue* record = ptr.load();
      if (record == nullptr) continue;
      release_(record->value);
    }

    // We did not spill into the map-based storage.
    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;

    // Adds a happens-before edge from the last call to SpilledLocal().
    std::unique_lock<std::mutex> lock(mu_);
    for (auto& kv : per_thread_map_) {
      release_(kv.second);
    }
  }

 private:
  struct ThreadIdAndValue {
    std::thread::id thread_id;
    T value;
  };

  // Uses an unordered map guarded by a mutex when the lock-free storage is
  // full.
  T& SpilledLocal(std::thread::id this_thread) {
    std::unique_lock<std::mutex> lock(mu_);

    auto it = per_thread_map_.find(this_thread);
    if (it == per_thread_map_.end()) {
      auto result = per_thread_map_.emplace(this_thread, T());
      eigen_assert(result.second);
      initialize_((*result.first).second);
      return (*result.first).second;
    } else {
      return it->second;
    }
  }

  Initialize initialize_;
  Release release_;
  const int capacity_;

  // Storage that backs the lock-free lookup table `ptr_`. Records are
  // stored in this storage contiguously, starting from index 0.
  MaxSizeVector<ThreadIdAndValue> data_;

  // Atomic pointers to the data stored in `data_`. Used as the lookup table
  // of a linear-probing hash map
  // (https://en.wikipedia.org/wiki/Linear_probing).
  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;

  // Number of records stored in `data_`.
  std::atomic<int> filled_records_;

  // We fall back on a per-thread map if the lock-free storage is full. In
  // practice this should never happen if `capacity_` is a reasonable
  // estimate of the number of threads running in the system.
  std::mutex mu_;  // Protects per_thread_map_.
  std::unordered_map<std::thread::id, T> per_thread_map_;
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H