Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:44:16

0001 //===------------------------- LSUnit.h --------------------------*- C++-*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 /// \file
0009 ///
0010 /// A Load/Store unit class that models load/store queues and that implements
0011 /// a simple weak memory consistency model.
0012 ///
0013 //===----------------------------------------------------------------------===//
0014 
0015 #ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
0016 #define LLVM_MCA_HARDWAREUNITS_LSUNIT_H
0017 
0018 #include "llvm/ADT/DenseMap.h"
0019 #include "llvm/ADT/SmallVector.h"
0020 #include "llvm/MC/MCSchedule.h"
0021 #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
0022 #include "llvm/MCA/Instruction.h"
0023 
0024 namespace llvm {
0025 namespace mca {
0026 
/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  /// Number of load queue entries currently in use.
  unsigned UsedLQEntries;
  /// Number of store queue entries currently in use.
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  /// Returns the number of load queue entries currently in use.
  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  /// Returns the number of store queue entries currently in use.
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  /// Reserves one load queue entry. Callers are expected to have verified
  /// availability first (see isAvailable).
  void acquireLQSlot() { ++UsedLQEntries; }
  /// Reserves one store queue entry. Callers are expected to have verified
  /// availability first (see isAvailable).
  void acquireSQSlot() { ++UsedSQEntries; }
  /// Releases one load queue entry.
  void releaseLQSlot() { --UsedLQEntries; }
  /// Releases one store queue entry.
  void releaseSQSlot() { --UsedSQEntries; }

  /// Returns true if loads are assumed to never alias with stores.
  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR is
  /// not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;

  /// Returns true if no store queue entry is in use.
  bool isSQEmpty() const { return !UsedSQEntries; }
  /// Returns true if no load queue entry is in use.
  bool isLQEmpty() const { return !UsedLQEntries; }
  // A size of zero means "unbounded", so an unbounded queue is never full.
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  /// Check if a previously dispatched instruction IR is now ready for execution.
  virtual bool isReady(const InstRef &IR) const = 0;

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  virtual bool isPending(const InstRef &IR) const = 0;

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  virtual bool isWaiting(const InstRef &IR) const = 0;

  /// Returns true if there are still memory operations that depend on IR.
  virtual bool hasDependentUsers(const InstRef &IR) const = 0;

  /// Returns the critical (longest-latency) memory dependency for GroupId.
  virtual const CriticalDependency getCriticalPredecessor(unsigned GroupId) = 0;

  virtual void onInstructionExecuted(const InstRef &IR) = 0;

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR) = 0;

  virtual void onInstructionIssued(const InstRef &IR) = 0;

  /// Advances the simulated clock by one cycle.
  virtual void cycleEvent() = 0;

#ifndef NDEBUG
  virtual void dump() const = 0;
#endif
};
0133 
0134 /// Default Load/Store Unit (LS Unit) for simulated processors.
0135 ///
0136 /// Each load (or store) consumes one entry in the load (or store) queue.
0137 ///
0138 /// Rules are:
0139 /// 1) A younger load is allowed to pass an older load only if there are no
0140 ///    stores nor barriers in between the two loads.
0141 /// 2) An younger store is not allowed to pass an older store.
0142 /// 3) A younger store is not allowed to pass an older load.
0143 /// 4) A younger load is allowed to pass an older store only if the load does
0144 ///    not alias with the store.
0145 ///
0146 /// This class optimistically assumes that loads don't alias store operations.
0147 /// Under this assumption, younger loads are always allowed to pass older
0148 /// stores (this would only affects rule 4).
0149 /// Essentially, this class doesn't perform any sort alias analysis to
0150 /// identify aliasing loads and stores.
0151 ///
0152 /// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
0153 /// set to `false` by the constructor of LSUnit.
0154 ///
0155 /// Note that this class doesn't know about the existence of different memory
0156 /// types for memory operations (example: write-through, write-combining, etc.).
0157 /// Derived classes are responsible for implementing that extra knowledge, and
0158 /// provide different sets of rules for loads and stores by overriding method
0159 /// `isReady()`.
0160 /// To emulate a write-combining memory type, rule 2. must be relaxed in a
0161 /// derived class to enable the reordering of non-aliasing store operations.
0162 ///
0163 /// No assumptions are made by this class on the size of the store buffer.  This
0164 /// class doesn't know how to identify cases where store-to-load forwarding may
0165 /// occur.
0166 ///
0167 /// LSUnit doesn't attempt to predict whether a load or store hits or misses
0168 /// the L1 cache. To be more specific, LSUnit doesn't know anything about
0169 /// cache hierarchy and memory types.
0170 /// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
0171 /// scheduling model provides an "optimistic" load-to-use latency (which usually
0172 /// matches the load-to-use latency for when there is a hit in the L1D).
0173 /// Derived classes may expand this knowledge.
0174 ///
0175 /// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
0176 /// memory-barrier like instructions.
0177 /// LSUnit conservatively assumes that an instruction which `mayLoad` and has
0178 /// `unmodeled side effects` behave like a "soft" load-barrier. That means, it
0179 /// serializes loads without forcing a flush of the load queue.
0180 /// Similarly, instructions that both `mayStore` and have `unmodeled side
0181 /// effects` are treated like store barriers. A full memory
0182 /// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
0183 /// effects. This is obviously inaccurate, but this is the best that we can do
0184 /// at the moment.
0185 ///
0186 /// Each load/store barrier consumes one entry in the load/store queue. A
0187 /// load/store barrier enforces ordering of loads/stores:
0188 ///  - A younger load cannot pass a load barrier.
0189 ///  - A younger store cannot pass a store barrier.
0190 ///
0191 /// A younger load has to wait for the memory load barrier to execute.
0192 /// A load/store barrier is "executed" when it becomes the oldest entry in
0193 /// the load/store queue(s). That also means, all the older loads/stores have
0194 /// already been executed.
0195 class LSUnit : public LSUnitBase {
0196 
0197   // This class doesn't know about the latency of a load instruction. So, it
0198   // conservatively/pessimistically assumes that the latency of a load opcode
0199   // matches the instruction latency.
0200   //
0201   // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
0202   // and load/store conflicts, the latency of a load is determined by the depth
0203   // of the load pipeline. So, we could use field `LoadLatency` in the
0204   // MCSchedModel to model that latency.
0205   // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
0206   // L1D, and it usually already accounts for any extra latency due to data
0207   // forwarding.
0208   // When doing throughput analysis, `LoadLatency` is likely to
0209   // be a better predictor of load latency than instruction latency. This is
0210   // particularly true when simulating code with temporal/spatial locality of
0211   // memory accesses.
0212   // Using `LoadLatency` (instead of the instruction latency) is also expected
0213   // to improve the load queue allocation for long latency instructions with
0214   // folded memory operands (See PR39829).
0215   //
0216   // FIXME: On some processors, load/store operations are split into multiple
0217   // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
0218   // not 256-bit data types. So, a 256-bit load is effectively split into two
0219   // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
0220   // simplicity, this class optimistically assumes that a load instruction only
0221   // consumes one entry in the LoadQueue.  Similarly, store instructions only
0222   // consume a single entry in the StoreQueue.
0223   // In future, we should reassess the quality of this design, and consider
0224   // alternative approaches that let instructions specify the number of
0225   // load/store queue entries which they consume at dispatch stage (See
0226   // PR39830).
0227   //
0228   // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
0229   // conservatively treated as a store barrier. It forces older store to be
0230   // executed before newer stores are issued.
0231   //
0232   // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
0233   // conservatively treated as a load barrier. It forces older loads to execute
0234   // before newer loads are issued.
0235 
0236 protected:
0237   /// A node of a memory dependency graph. A MemoryGroup describes a set of
0238   /// instructions with same memory dependencies.
0239   ///
0240   /// By construction, instructions of a MemoryGroup don't depend on each other.
0241   /// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
0242   /// A Memory group identifier is then stored as a "token" in field
0243   /// Instruction::LSUTokenID of each dispatched instructions. That token is
0244   /// used internally by the LSUnit to track memory dependencies.
0245   class MemoryGroup {
0246     unsigned NumPredecessors = 0;
0247     unsigned NumExecutingPredecessors = 0;
0248     unsigned NumExecutedPredecessors = 0;
0249 
0250     unsigned NumInstructions = 0;
0251     unsigned NumExecuting = 0;
0252     unsigned NumExecuted = 0;
0253     // Successors that are in a order dependency with this group.
0254     SmallVector<MemoryGroup *, 4> OrderSucc;
0255     // Successors that are in a data dependency with this group.
0256     SmallVector<MemoryGroup *, 4> DataSucc;
0257 
0258     CriticalDependency CriticalPredecessor;
0259     InstRef CriticalMemoryInstruction;
0260 
0261     MemoryGroup(const MemoryGroup &) = delete;
0262     MemoryGroup &operator=(const MemoryGroup &) = delete;
0263 
0264   public:
0265     MemoryGroup() = default;
0266     MemoryGroup(MemoryGroup &&) = default;
0267 
0268     size_t getNumSuccessors() const {
0269       return OrderSucc.size() + DataSucc.size();
0270     }
0271     unsigned getNumPredecessors() const { return NumPredecessors; }
0272     unsigned getNumExecutingPredecessors() const {
0273       return NumExecutingPredecessors;
0274     }
0275     unsigned getNumExecutedPredecessors() const {
0276       return NumExecutedPredecessors;
0277     }
0278     unsigned getNumInstructions() const { return NumInstructions; }
0279     unsigned getNumExecuting() const { return NumExecuting; }
0280     unsigned getNumExecuted() const { return NumExecuted; }
0281 
0282     const InstRef &getCriticalMemoryInstruction() const {
0283       return CriticalMemoryInstruction;
0284     }
0285     const CriticalDependency &getCriticalPredecessor() const {
0286       return CriticalPredecessor;
0287     }
0288 
0289     void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
0290       // Do not need to add a dependency if there is no data
0291       // dependency and all instructions from this group have been
0292       // issued already.
0293       if (!IsDataDependent && isExecuting())
0294         return;
0295 
0296       Group->NumPredecessors++;
0297       assert(!isExecuted() && "Should have been removed!");
0298       if (isExecuting())
0299         Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);
0300 
0301       if (IsDataDependent)
0302         DataSucc.emplace_back(Group);
0303       else
0304         OrderSucc.emplace_back(Group);
0305     }
0306 
0307     bool isWaiting() const {
0308       return NumPredecessors >
0309              (NumExecutingPredecessors + NumExecutedPredecessors);
0310     }
0311     bool isPending() const {
0312       return NumExecutingPredecessors &&
0313              ((NumExecutedPredecessors + NumExecutingPredecessors) ==
0314               NumPredecessors);
0315     }
0316     bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
0317     bool isExecuting() const {
0318       return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
0319     }
0320     bool isExecuted() const { return NumInstructions == NumExecuted; }
0321 
0322     void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
0323       assert(!isReady() && "Unexpected group-start event!");
0324       NumExecutingPredecessors++;
0325 
0326       if (!ShouldUpdateCriticalDep)
0327         return;
0328 
0329       unsigned Cycles = IR.getInstruction()->getCyclesLeft();
0330       if (CriticalPredecessor.Cycles < Cycles) {
0331         CriticalPredecessor.IID = IR.getSourceIndex();
0332         CriticalPredecessor.Cycles = Cycles;
0333       }
0334     }
0335 
0336     void onGroupExecuted() {
0337       assert(!isReady() && "Inconsistent state found!");
0338       NumExecutingPredecessors--;
0339       NumExecutedPredecessors++;
0340     }
0341 
0342     void onInstructionIssued(const InstRef &IR) {
0343       assert(!isExecuting() && "Invalid internal state!");
0344       ++NumExecuting;
0345 
0346       // update the CriticalMemDep.
0347       const Instruction &IS = *IR.getInstruction();
0348       if ((bool)CriticalMemoryInstruction) {
0349         const Instruction &OtherIS =
0350             *CriticalMemoryInstruction.getInstruction();
0351         if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
0352           CriticalMemoryInstruction = IR;
0353       } else {
0354         CriticalMemoryInstruction = IR;
0355       }
0356 
0357       if (!isExecuting())
0358         return;
0359 
0360       // Notify successors that this group started execution.
0361       for (MemoryGroup *MG : OrderSucc) {
0362         MG->onGroupIssued(CriticalMemoryInstruction, false);
0363         // Release the order dependency with this group.
0364         MG->onGroupExecuted();
0365       }
0366 
0367       for (MemoryGroup *MG : DataSucc)
0368         MG->onGroupIssued(CriticalMemoryInstruction, true);
0369     }
0370 
0371     void onInstructionExecuted(const InstRef &IR) {
0372       assert(isReady() && !isExecuted() && "Invalid internal state!");
0373       --NumExecuting;
0374       ++NumExecuted;
0375 
0376       if (CriticalMemoryInstruction &&
0377           CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
0378         CriticalMemoryInstruction.invalidate();
0379       }
0380 
0381       if (!isExecuted())
0382         return;
0383 
0384       // Notify data dependent successors that this group has finished
0385       // execution.
0386       for (MemoryGroup *MG : DataSucc)
0387         MG->onGroupExecuted();
0388     }
0389 
0390     void addInstruction() {
0391       assert(!getNumSuccessors() && "Cannot add instructions to this group!");
0392       ++NumInstructions;
0393     }
0394 
0395     void cycleEvent() {
0396       if (isWaiting() && CriticalPredecessor.Cycles)
0397         CriticalPredecessor.Cycles--;
0398     }
0399   };
0400   /// Used to map group identifiers to MemoryGroups.
0401   DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
0402   unsigned NextGroupID = 1;
0403 
0404   unsigned CurrentLoadGroupID;
0405   unsigned CurrentLoadBarrierGroupID;
0406   unsigned CurrentStoreGroupID;
0407   unsigned CurrentStoreBarrierGroupID;
0408 
0409 public:
0410   LSUnit(const MCSchedModel &SM)
0411       : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
0412   LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
0413       : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
0414   LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
0415       : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
0416         CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
0417         CurrentStoreBarrierGroupID(0) {}
0418 
0419   /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
0420   /// accomodate instruction IR.
0421   Status isAvailable(const InstRef &IR) const override;
0422 
0423   bool isReady(const InstRef &IR) const override {
0424     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
0425     const MemoryGroup &Group = getGroup(GroupID);
0426     return Group.isReady();
0427   }
0428 
0429   bool isPending(const InstRef &IR) const override {
0430     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
0431     const MemoryGroup &Group = getGroup(GroupID);
0432     return Group.isPending();
0433   }
0434 
0435   bool isWaiting(const InstRef &IR) const override {
0436     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
0437     const MemoryGroup &Group = getGroup(GroupID);
0438     return Group.isWaiting();
0439   }
0440 
0441   bool hasDependentUsers(const InstRef &IR) const override {
0442     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
0443     const MemoryGroup &Group = getGroup(GroupID);
0444     return !Group.isExecuted() && Group.getNumSuccessors();
0445   }
0446 
0447   const CriticalDependency getCriticalPredecessor(unsigned GroupId) override {
0448     const MemoryGroup &Group = getGroup(GroupId);
0449     return Group.getCriticalPredecessor();
0450   }
0451 
0452   /// Allocates LS resources for instruction IR.
0453   ///
0454   /// This method assumes that a previous call to `isAvailable(IR)` succeeded
0455   /// returning LSU_AVAILABLE.
0456   ///
0457   /// Rules are:
0458   /// By default, rules are:
0459   /// 1. A store may not pass a previous store.
0460   /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
0461   /// 3. A load may pass a previous load.
0462   /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
0463   /// 5. A load has to wait until an older load barrier is fully executed.
0464   /// 6. A store has to wait until an older store barrier is fully executed.
0465   unsigned dispatch(const InstRef &IR) override;
0466 
0467   virtual void onInstructionIssued(const InstRef &IR) override {
0468     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
0469     Groups[GroupID]->onInstructionIssued(IR);
0470   }
0471 
0472   virtual void onInstructionRetired(const InstRef &IR) override;
0473 
0474   virtual void onInstructionExecuted(const InstRef &IR) override;
0475 
0476   virtual void cycleEvent() override;
0477 
0478 #ifndef NDEBUG
0479   virtual void dump() const override;
0480 #endif
0481 
0482 private:
0483   bool isValidGroupID(unsigned Index) const {
0484     return Index && Groups.contains(Index);
0485   }
0486 
0487   const MemoryGroup &getGroup(unsigned Index) const {
0488     assert(isValidGroupID(Index) && "Group doesn't exist!");
0489     return *Groups.find(Index)->second;
0490   }
0491 
0492   MemoryGroup &getGroup(unsigned Index) {
0493     assert(isValidGroupID(Index) && "Group doesn't exist!");
0494     return *Groups.find(Index)->second;
0495   }
0496 
0497   unsigned createMemoryGroup() {
0498     Groups.insert(std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
0499     return NextGroupID++;
0500   }
0501 };
0502 
0503 } // namespace mca
0504 } // namespace llvm
0505 
0506 #endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H