llvm/Support/BalancedPartitioning.h

0001 //===- BalancedPartitioning.h ---------------------------------------------===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 // This file implements BalancedPartitioning, a recursive balanced graph
0010 // partitioning algorithm.
0011 //
0012 // The algorithm is used to find an ordering of FunctionNodes while optimizing
0013 // a specified objective. The algorithm uses recursive bisection; it starts
0014 // with a collection of unordered FunctionNodes and tries to split them into
0015 // two sets (buckets) of equal cardinality. Each bisection step is comprised of
0016 // iterations that greedily swap the FunctionNodes between the two buckets while
0017 // there is an improvement of the objective. Once the process converges, the
0018 // problem is divided into two sub-problems of half the size, which are
0019 // recursively applied for the two buckets. The final ordering of the
0020 // FunctionNodes is obtained by concatenating the two (recursively computed)
0021 // orderings.
0022 //
0023 // In order to speed up the computation, we limit the depth of the recursive
0024 // tree by a specified constant (SplitDepth) and apply at most a constant
0025 // number of greedy iterations per split (IterationsPerSplit). The worst-case
0026 // time complexity of the implementation is bounded by O(M*log^2 N), where
0027 // N is the number of FunctionNodes and M is the number of
0028 // FunctionNode-UtilityNode edges; (assuming that any collection of D
0029 // FunctionNodes contains O(D) UtilityNodes). Notice that the two different
0030 // recursive sub-problems are independent and thus can be efficiently processed
0031 // in parallel.
0032 //
0033 // Reference:
0034 //   * Optimizing Function Layout for Mobile Applications,
0035 //     https://arxiv.org/abs/2211.09285
0036 //
0037 //===----------------------------------------------------------------------===//
0038
0039 #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
0040 #define LLVM_SUPPORT_BALANCED_PARTITIONING_H
0041
0042 #include "raw_ostream.h"
0043 #include "llvm/ADT/ArrayRef.h"
0044
0045 #include <atomic>
0046 #include <condition_variable>
0047 #include <mutex>
0048 #include <random>
0049 #include <vector>
0050
0051 namespace llvm {
0052
0053 class ThreadPoolInterface;
0054 /// A function with a set of utility nodes where it is beneficial to order two
0055 /// functions close together if they have similar utility nodes
0056 class BPFunctionNode {
0057   friend class BalancedPartitioning;
0058
0059 public:
0060   using IDT = uint64_t;
0061   using UtilityNodeT = uint32_t;
0062
0063   /// \param UtilityNodes the set of utility nodes (must be unique'd)
0064   BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
0065       : Id(Id), UtilityNodes(UtilityNodes) {}
0066
0067   /// The ID of this node
0068   IDT Id;
0069
0070   void dump(raw_ostream &OS) const;
0071
0072 protected:
0073   /// The list of utility nodes associated with this node
0074   SmallVector<UtilityNodeT, 4> UtilityNodes;
0075   /// The bucket assigned by balanced partitioning
0076   std::optional<unsigned> Bucket;
0077   /// The index of the input order of the FunctionNodes
0078   uint64_t InputOrderIndex = 0;
0079
0080   friend class BPFunctionNodeTest_Basic_Test;
0081   friend class BalancedPartitioningTest_Basic_Test;
0082   friend class BalancedPartitioningTest_Large_Test;
0083 };
0084
0085 /// Algorithm parameters; default values are tuned on real-world binaries
0086 struct BalancedPartitioningConfig {
0087   /// The depth of the recursive bisection
0088   unsigned SplitDepth = 18;
0089   /// The maximum number of bp iterations per split
0090   unsigned IterationsPerSplit = 40;
0091   /// The probability for a vertex to skip a move from its current bucket to
0092   /// another bucket; it often helps to escape from a local optima
0093   float SkipProbability = 0.1f;
0094   /// Recursive subtasks up to the given depth are added to the queue and
0095   /// distributed among threads by ThreadPool; all subsequent calls are executed
0096   /// on the same thread
0097   unsigned TaskSplitDepth = 9;
0098 };
0099
0100 class BalancedPartitioning {
0101 public:
0102   BalancedPartitioning(const BalancedPartitioningConfig &Config);
0103
0104   /// Run recursive graph partitioning that optimizes a given objective.
0105   void run(std::vector<BPFunctionNode> &Nodes) const;
0106
0107 private:
0108   struct UtilitySignature;
0109   using SignaturesT = SmallVector<UtilitySignature, 4>;
0110   using FunctionNodeRange =
0111       iterator_range<std::vector<BPFunctionNode>::iterator>;
0112
0113   /// A special ThreadPool that allows for spawning new tasks after blocking on
0114   /// wait(). BalancedPartitioning recursively spawns new threads inside other
0115   /// threads, so we need to track how many active threads that could spawn more
0116   /// threads.
0117   struct BPThreadPool {
0118     ThreadPoolInterface &TheThreadPool;
0119     std::mutex mtx;
0120     std::condition_variable cv;
0121     /// The number of threads that could spawn more threads
0122     std::atomic<int> NumActiveThreads = 0;
0123     /// Only true when all threads are down spawning new threads
0124     bool IsFinishedSpawning = false;
0125     /// Asynchronous submission of the task to the pool
0126     template <typename Func> void async(Func &&F);
0127     /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
0128     /// acceptable for other threads to add more tasks while blocking on this
0129     /// call.
0130     void wait();
0131     BPThreadPool(ThreadPoolInterface &TheThreadPool)
0132         : TheThreadPool(TheThreadPool) {}
0133   };
0134
0135   /// Run a recursive bisection of a given list of FunctionNodes
0136   /// \param RecDepth the current depth of recursion
0137   /// \param RootBucket the initial bucket of the dataVertices
0138   /// \param Offset the assigned buckets are the range [Offset, Offset +
0139   /// Nodes.size()]
0140   void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
0141               unsigned RootBucket, unsigned Offset,
0142               std::optional<BPThreadPool> &TP) const;
0143
0144   /// Run bisection iterations
0145   void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
0146                      unsigned RightBucket, std::mt19937 &RNG) const;
0147
0148   /// Run a bisection iteration to improve the optimization goal
0149   /// \returns the total number of moved FunctionNodes
0150   unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
0151                         unsigned RightBucket, SignaturesT &Signatures,
0152                         std::mt19937 &RNG) const;
0153
0154   /// Try to move \p N from one bucket to another
0155   /// \returns true iff \p N is moved
0156   bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
0157                         unsigned RightBucket, SignaturesT &Signatures,
0158                         std::mt19937 &RNG) const;
0159
0160   /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
0161   /// 1 The method is used for an initial assignment before a bisection step
0162   void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
0163
0164   /// The cost of the uniform log-gap cost, assuming a utility node has \p X
0165   /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
0166   float logCost(unsigned X, unsigned Y) const;
0167
0168   float log2Cached(unsigned i) const;
0169
0170   const BalancedPartitioningConfig &Config;
0171
0172   /// Precomputed values of log2(x). Table size is small enough to fit in cache.
0173   static constexpr unsigned LOG_CACHE_SIZE = 16384;
0174   float Log2Cache[LOG_CACHE_SIZE];
0175
0176   /// The signature of a particular utility node used for the bisection step,
0177   /// i.e., the number of \p FunctionNodes in each of the two buckets
0178   struct UtilitySignature {
0179     /// The number of \p FunctionNodes in the left bucket
0180     unsigned LeftCount = 0;
0181     /// The number of \p FunctionNodes in the right bucket
0182     unsigned RightCount = 0;
0183     /// The cached gain of moving a \p FunctionNode from the left bucket to the
0184     /// right bucket
0185     float CachedGainLR;
0186     /// The cached gain of moving a \p FunctionNode from the right bucket to the
0187     /// left bucket
0188     float CachedGainRL;
0189     /// Whether \p CachedGainLR and \p CachedGainRL are valid
0190     bool CachedGainIsValid = false;
0191   };
0192
0193 protected:
0194   /// Compute the move gain for uniform log-gap cost
0195   static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
0196                         const SignaturesT &Signatures);
0197   friend class BalancedPartitioningTest_MoveGain_Test;
0198 };
0199
0200 } // end namespace llvm
0201
0202 #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H