|
|
|||
File indexing completed on 2026-05-10 08:44:27
0001 //===- BalancedPartitioning.h ---------------------------------------------===// 0002 // 0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 0004 // See https://llvm.org/LICENSE.txt for license information. 0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 0006 // 0007 //===----------------------------------------------------------------------===// 0008 // 0009 // This file implements BalancedPartitioning, a recursive balanced graph 0010 // partitioning algorithm. 0011 // 0012 // The algorithm is used to find an ordering of FunctionNodes while optimizing 0013 // a specified objective. The algorithm uses recursive bisection; it starts 0014 // with a collection of unordered FunctionNodes and tries to split them into 0015 // two sets (buckets) of equal cardinality. Each bisection step is comprised of 0016 // iterations that greedily swap the FunctionNodes between the two buckets while 0017 // there is an improvement of the objective. Once the process converges, the 0018 // problem is divided into two sub-problems of half the size, which are 0019 // recursively applied for the two buckets. The final ordering of the 0020 // FunctionNodes is obtained by concatenating the two (recursively computed) 0021 // orderings. 0022 // 0023 // In order to speed up the computation, we limit the depth of the recursive 0024 // tree by a specified constant (SplitDepth) and apply at most a constant 0025 // number of greedy iterations per split (IterationsPerSplit). The worst-case 0026 // time complexity of the implementation is bounded by O(M*log^2 N), where 0027 // N is the number of FunctionNodes and M is the number of 0028 // FunctionNode-UtilityNode edges; (assuming that any collection of D 0029 // FunctionNodes contains O(D) UtilityNodes). Notice that the two different 0030 // recursive sub-problems are independent and thus can be efficiently processed 0031 // in parallel. 0032 // 0033 // Reference: 0034 // * Optimizing Function Layout for Mobile Applications, 0035 // https://arxiv.org/abs/2211.09285 0036 // 0037 //===----------------------------------------------------------------------===// 0038 0039 #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H 0040 #define LLVM_SUPPORT_BALANCED_PARTITIONING_H 0041 0042 #include "raw_ostream.h" 0043 #include "llvm/ADT/ArrayRef.h" 0044 0045 #include <atomic> 0046 #include <condition_variable> 0047 #include <mutex> 0048 #include <random> 0049 #include <vector> 0050 0051 namespace llvm { 0052 0053 class ThreadPoolInterface; 0054 /// A function with a set of utility nodes where it is beneficial to order two 0055 /// functions close together if they have similar utility nodes 0056 class BPFunctionNode { 0057 friend class BalancedPartitioning; 0058 0059 public: 0060 using IDT = uint64_t; 0061 using UtilityNodeT = uint32_t; 0062 0063 /// \param UtilityNodes the set of utility nodes (must be unique'd) 0064 BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes) 0065 : Id(Id), UtilityNodes(UtilityNodes) {} 0066 0067 /// The ID of this node 0068 IDT Id; 0069 0070 void dump(raw_ostream &OS) const; 0071 0072 protected: 0073 /// The list of utility nodes associated with this node 0074 SmallVector<UtilityNodeT, 4> UtilityNodes; 0075 /// The bucket assigned by balanced partitioning 0076 std::optional<unsigned> Bucket; 0077 /// The index of the input order of the FunctionNodes 0078 uint64_t InputOrderIndex = 0; 0079 0080 friend class BPFunctionNodeTest_Basic_Test; 0081 friend class BalancedPartitioningTest_Basic_Test; 0082 friend class BalancedPartitioningTest_Large_Test; 0083 }; 0084 0085 /// Algorithm parameters; default values are tuned on real-world binaries 0086 struct BalancedPartitioningConfig { 0087 /// The depth of the recursive bisection 0088 unsigned SplitDepth = 18; 0089 /// The maximum number of bp iterations per split 0090 unsigned IterationsPerSplit = 40; 0091 /// The probability for a vertex to skip a move from its current bucket to 0092 /// another bucket; it often helps to escape from a local optima 0093 float SkipProbability = 0.1f; 0094 /// Recursive subtasks up to the given depth are added to the queue and 0095 /// distributed among threads by ThreadPool; all subsequent calls are executed 0096 /// on the same thread 0097 unsigned TaskSplitDepth = 9; 0098 }; 0099 0100 class BalancedPartitioning { 0101 public: 0102 BalancedPartitioning(const BalancedPartitioningConfig &Config); 0103 0104 /// Run recursive graph partitioning that optimizes a given objective. 0105 void run(std::vector<BPFunctionNode> &Nodes) const; 0106 0107 private: 0108 struct UtilitySignature; 0109 using SignaturesT = SmallVector<UtilitySignature, 4>; 0110 using FunctionNodeRange = 0111 iterator_range<std::vector<BPFunctionNode>::iterator>; 0112 0113 /// A special ThreadPool that allows for spawning new tasks after blocking on 0114 /// wait(). BalancedPartitioning recursively spawns new threads inside other 0115 /// threads, so we need to track how many active threads that could spawn more 0116 /// threads. 0117 struct BPThreadPool { 0118 ThreadPoolInterface &TheThreadPool; 0119 std::mutex mtx; 0120 std::condition_variable cv; 0121 /// The number of threads that could spawn more threads 0122 std::atomic<int> NumActiveThreads = 0; 0123 /// Only true when all threads are down spawning new threads 0124 bool IsFinishedSpawning = false; 0125 /// Asynchronous submission of the task to the pool 0126 template <typename Func> void async(Func &&F); 0127 /// Blocking wait for all threads to complete. Unlike ThreadPool, it is 0128 /// acceptable for other threads to add more tasks while blocking on this 0129 /// call. 0130 void wait(); 0131 BPThreadPool(ThreadPoolInterface &TheThreadPool) 0132 : TheThreadPool(TheThreadPool) {} 0133 }; 0134 0135 /// Run a recursive bisection of a given list of FunctionNodes 0136 /// \param RecDepth the current depth of recursion 0137 /// \param RootBucket the initial bucket of the dataVertices 0138 /// \param Offset the assigned buckets are the range [Offset, Offset + 0139 /// Nodes.size()] 0140 void bisect(const FunctionNodeRange Nodes, unsigned RecDepth, 0141 unsigned RootBucket, unsigned Offset, 0142 std::optional<BPThreadPool> &TP) const; 0143 0144 /// Run bisection iterations 0145 void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket, 0146 unsigned RightBucket, std::mt19937 &RNG) const; 0147 0148 /// Run a bisection iteration to improve the optimization goal 0149 /// \returns the total number of moved FunctionNodes 0150 unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket, 0151 unsigned RightBucket, SignaturesT &Signatures, 0152 std::mt19937 &RNG) const; 0153 0154 /// Try to move \p N from one bucket to another 0155 /// \returns true iff \p N is moved 0156 bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket, 0157 unsigned RightBucket, SignaturesT &Signatures, 0158 std::mt19937 &RNG) const; 0159 0160 /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket + 0161 /// 1 The method is used for an initial assignment before a bisection step 0162 void split(const FunctionNodeRange Nodes, unsigned StartBucket) const; 0163 0164 /// The cost of the uniform log-gap cost, assuming a utility node has \p X 0165 /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one. 0166 float logCost(unsigned X, unsigned Y) const; 0167 0168 float log2Cached(unsigned i) const; 0169 0170 const BalancedPartitioningConfig &Config; 0171 0172 /// Precomputed values of log2(x). Table size is small enough to fit in cache. 0173 static constexpr unsigned LOG_CACHE_SIZE = 16384; 0174 float Log2Cache[LOG_CACHE_SIZE]; 0175 0176 /// The signature of a particular utility node used for the bisection step, 0177 /// i.e., the number of \p FunctionNodes in each of the two buckets 0178 struct UtilitySignature { 0179 /// The number of \p FunctionNodes in the left bucket 0180 unsigned LeftCount = 0; 0181 /// The number of \p FunctionNodes in the right bucket 0182 unsigned RightCount = 0; 0183 /// The cached gain of moving a \p FunctionNode from the left bucket to the 0184 /// right bucket 0185 float CachedGainLR; 0186 /// The cached gain of moving a \p FunctionNode from the right bucket to the 0187 /// left bucket 0188 float CachedGainRL; 0189 /// Whether \p CachedGainLR and \p CachedGainRL are valid 0190 bool CachedGainIsValid = false; 0191 }; 0192 0193 protected: 0194 /// Compute the move gain for uniform log-gap cost 0195 static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, 0196 const SignaturesT &Signatures); 0197 friend class BalancedPartitioningTest_MoveGain_Test; 0198 }; 0199 0200 } // end namespace llvm 0201 0202 #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|