Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:43:33

0001 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 // Software pipelining (SWP) is an instruction scheduling technique for loops
0010 // that overlaps loop iterations and exploits ILP via compiler transformations.
0011 //
0012 // There are multiple methods for analyzing a loop and creating a schedule.
0013 // An example algorithm is Swing Modulo Scheduling (implemented by the
0014 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant
0015 // for the task of actually rewriting a loop to adhere to the schedule, which
0016 // is what this file does.
0017 //
0018 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note
0019 // that we only support single-block loops, so "block" and "loop" can be used
0020 // interchangably.
0021 //
0022 // The Cycle of an instruction defines a partial order of the instructions in
0023 // the remapped loop. Instructions within a cycle must not consume the output
0024 // of any instruction in the same cycle. Cycle information is assumed to have
0025 // been calculated such that the processor will execute instructions in
0026 // lock-step (for example in a VLIW ISA).
0027 //
0028 // The Stage of an instruction defines the mapping between logical loop
0029 // iterations and pipelined loop iterations. An example (unrolled) pipeline
0030 // may look something like:
0031 //
0032 //  I0[0]                      Execute instruction I0 of iteration 0
0033 //  I1[0], I0[1]               Execute I0 of iteration 1 and I1 of iteration 1
0034 //         I1[1], I0[2]
0035 //                I1[2], I0[3]
0036 //
0037 // In the schedule for this unrolled sequence we would say that I0 was scheduled
0038 // in stage 0 and I1 in stage 1:
0039 //
0040 //  loop:
0041 //    [stage 0] x = I0
0042 //    [stage 1] I1 x (from stage 0)
0043 //
0044 // And to actually generate valid code we must insert a phi:
0045 //
0046 //  loop:
0047 //    x' = phi(x)
0048 //    x = I0
0049 //    I1 x'
0050 //
0051 // This is a simple example; the rules for how to generate correct code given
0052 // an arbitrary schedule containing loop-carried values are complex.
0053 //
0054 // Note that these examples only mention the steady-state kernel of the
0055 // generated loop; prologs and epilogs must be generated also that prime and
0056 // flush the pipeline. Doing so is nontrivial.
0057 //
0058 //===----------------------------------------------------------------------===//
0059 
0060 #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
0061 #define LLVM_CODEGEN_MODULOSCHEDULE_H
0062 
0063 #include "llvm/CodeGen/MachineFunction.h"
0064 #include "llvm/CodeGen/MachineLoopUtils.h"
0065 #include "llvm/CodeGen/TargetInstrInfo.h"
0066 #include "llvm/CodeGen/TargetSubtargetInfo.h"
0067 #include <deque>
0068 #include <map>
0069 #include <vector>
0070 
0071 namespace llvm {
0072 class MachineBasicBlock;
0073 class MachineLoop;
0074 class MachineRegisterInfo;
0075 class MachineInstr;
0076 class LiveIntervals;
0077 
0078 /// Represents a schedule for a single-block loop. For every instruction we
0079 /// maintain a Cycle and Stage.
0080 class ModuloSchedule {
0081 private:
0082   /// The block containing the loop instructions.
0083   MachineLoop *Loop;
0084 
0085   /// The instructions to be generated, in total order. Cycle provides a partial
0086   /// order; the total order within cycles has been decided by the schedule
0087   /// producer.
0088   std::vector<MachineInstr *> ScheduledInstrs;
0089 
0090   /// The cycle for each instruction.
0091   DenseMap<MachineInstr *, int> Cycle;
0092 
0093   /// The stage for each instruction.
0094   DenseMap<MachineInstr *, int> Stage;
0095 
0096   /// The number of stages in this schedule (Max(Stage) + 1).
0097   int NumStages;
0098 
0099 public:
0100   /// Create a new ModuloSchedule.
0101   /// \arg ScheduledInstrs The new loop instructions, in total resequenced
0102   ///    order.
0103   /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
0104   ///    not need to start at zero. ScheduledInstrs must be partially ordered by
0105   ///    Cycle.
0106   /// \arg Stage Stage index for all instructions in ScheduleInstrs.
0107   ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
0108                  std::vector<MachineInstr *> ScheduledInstrs,
0109                  DenseMap<MachineInstr *, int> Cycle,
0110                  DenseMap<MachineInstr *, int> Stage)
0111       : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
0112         Stage(std::move(Stage)) {
0113     NumStages = 0;
0114     for (auto &KV : this->Stage)
0115       NumStages = std::max(NumStages, KV.second);
0116     ++NumStages;
0117   }
0118 
0119   /// Return the single-block loop being scheduled.
0120   MachineLoop *getLoop() const { return Loop; }
0121 
0122   /// Return the number of stages contained in this schedule, which is the
0123   /// largest stage index + 1.
0124   int getNumStages() const { return NumStages; }
0125 
0126   /// Return the first cycle in the schedule, which is the cycle index of the
0127   /// first instruction.
0128   int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
0129 
0130   /// Return the final cycle in the schedule, which is the cycle index of the
0131   /// last instruction.
0132   int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
0133 
0134   /// Return the stage that MI is scheduled in, or -1.
0135   int getStage(MachineInstr *MI) {
0136     auto I = Stage.find(MI);
0137     return I == Stage.end() ? -1 : I->second;
0138   }
0139 
0140   /// Return the cycle that MI is scheduled at, or -1.
0141   int getCycle(MachineInstr *MI) {
0142     auto I = Cycle.find(MI);
0143     return I == Cycle.end() ? -1 : I->second;
0144   }
0145 
0146   /// Set the stage of a newly created instruction.
0147   void setStage(MachineInstr *MI, int MIStage) {
0148     assert(Stage.count(MI) == 0);
0149     Stage[MI] = MIStage;
0150   }
0151 
0152   /// Return the rescheduled instructions in order.
0153   ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
0154 
0155   void dump() { print(dbgs()); }
0156   void print(raw_ostream &OS);
0157 };
0158 
0159 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
0160 /// rewriting the old loop and inserting prologs and epilogs as required.
0161 class ModuloScheduleExpander {
0162 public:
0163   using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
0164 
0165 private:
0166   using ValueMapTy = DenseMap<unsigned, unsigned>;
0167   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
0168   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
0169 
0170   ModuloSchedule &Schedule;
0171   MachineFunction &MF;
0172   const TargetSubtargetInfo &ST;
0173   MachineRegisterInfo &MRI;
0174   const TargetInstrInfo *TII = nullptr;
0175   LiveIntervals &LIS;
0176 
0177   MachineBasicBlock *BB = nullptr;
0178   MachineBasicBlock *Preheader = nullptr;
0179   MachineBasicBlock *NewKernel = nullptr;
0180   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
0181 
0182   /// Map for each register and the max difference between its uses and def.
0183   /// The first element in the pair is the max difference in stages. The
0184   /// second is true if the register defines a Phi value and loop value is
0185   /// scheduled before the Phi.
0186   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
0187 
0188   /// Instructions to change when emitting the final schedule.
0189   InstrChangesTy InstrChanges;
0190 
0191   void generatePipelinedLoop();
0192   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
0193                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
0194   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
0195                       MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
0196                       ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
0197                       MBBVectorTy &PrologBBs);
0198   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
0199                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
0200                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
0201                             unsigned LastStageNum, unsigned CurStageNum,
0202                             bool IsLast);
0203   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
0204                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
0205                     ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
0206                     InstrMapTy &InstrMap, unsigned LastStageNum,
0207                     unsigned CurStageNum, bool IsLast);
0208   void removeDeadInstructions(MachineBasicBlock *KernelBB,
0209                               MBBVectorTy &EpilogBBs);
0210   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
0211   void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
0212                    MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
0213                    ValueMapTy *VRMap);
0214   bool computeDelta(MachineInstr &MI, unsigned &Delta);
0215   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
0216                          unsigned Num);
0217   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
0218                            unsigned InstStageNum);
0219   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
0220                                     unsigned InstStageNum);
0221   void updateInstruction(MachineInstr *NewMI, bool LastDef,
0222                          unsigned CurStageNum, unsigned InstrStageNum,
0223                          ValueMapTy *VRMap);
0224   MachineInstr *findDefInLoop(unsigned Reg);
0225   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
0226                          unsigned LoopStage, ValueMapTy *VRMap,
0227                          MachineBasicBlock *BB);
0228   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
0229                         ValueMapTy *VRMap, InstrMapTy &InstrMap);
0230   void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
0231                              unsigned CurStageNum, unsigned PhiNum,
0232                              MachineInstr *Phi, unsigned OldReg,
0233                              unsigned NewReg, unsigned PrevReg = 0);
0234   bool isLoopCarried(MachineInstr &Phi);
0235 
0236   /// Return the max. number of stages/iterations that can occur between a
0237   /// register definition and its uses.
0238   unsigned getStagesForReg(int Reg, unsigned CurStage) {
0239     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
0240     if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
0241         Stages.second)
0242       return 1;
0243     return Stages.first;
0244   }
0245 
0246   /// The number of stages for a Phi is a little different than other
0247   /// instructions. The minimum value computed in RegToStageDiff is 1
0248   /// because we assume the Phi is needed for at least 1 iteration.
0249   /// This is not the case if the loop value is scheduled prior to the
0250   /// Phi in the same stage.  This function returns the number of stages
0251   /// or iterations needed between the Phi definition and any uses.
0252   unsigned getStagesForPhi(int Reg) {
0253     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
0254     if (Stages.second)
0255       return Stages.first;
0256     return Stages.first - 1;
0257   }
0258 
0259 public:
0260   /// Create a new ModuloScheduleExpander.
0261   /// \arg InstrChanges Modifications to make to instructions with memory
0262   ///   operands.
0263   /// FIXME: InstrChanges is opaque and is an implementation detail of an
0264   ///   optimization in MachinePipeliner that crosses abstraction boundaries.
0265   ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
0266                          LiveIntervals &LIS, InstrChangesTy InstrChanges)
0267       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
0268         TII(ST.getInstrInfo()), LIS(LIS),
0269         InstrChanges(std::move(InstrChanges)) {}
0270 
0271   /// Performs the actual expansion.
0272   void expand();
0273   /// Performs final cleanup after expansion.
0274   void cleanup();
0275 
0276   /// Returns the newly rewritten kernel block, or nullptr if this was
0277   /// optimized away.
0278   MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
0279 };
0280 
0281 /// A reimplementation of ModuloScheduleExpander. It works by generating a
0282 /// standalone kernel loop and peeling out the prologs and epilogs.
0283 class PeelingModuloScheduleExpander {
0284 public:
0285   PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
0286                                 LiveIntervals *LIS)
0287       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
0288         TII(ST.getInstrInfo()), LIS(LIS) {}
0289 
0290   void expand();
0291 
0292   /// Runs ModuloScheduleExpander and treats it as a golden input to validate
0293   /// aspects of the code generated by PeelingModuloScheduleExpander.
0294   void validateAgainstModuloScheduleExpander();
0295 
0296 protected:
0297   ModuloSchedule &Schedule;
0298   MachineFunction &MF;
0299   const TargetSubtargetInfo &ST;
0300   MachineRegisterInfo &MRI;
0301   const TargetInstrInfo *TII = nullptr;
0302   LiveIntervals *LIS = nullptr;
0303 
0304   /// The original loop block that gets rewritten in-place.
0305   MachineBasicBlock *BB = nullptr;
0306   /// The original loop preheader.
0307   MachineBasicBlock *Preheader = nullptr;
0308   /// All prolog and epilog blocks.
0309   SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
0310   /// For every block, the stages that are produced.
0311   DenseMap<MachineBasicBlock *, BitVector> LiveStages;
0312   /// For every block, the stages that are available. A stage can be available
0313   /// but not produced (in the epilog) or produced but not available (in the
0314   /// prolog).
0315   DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
0316   /// When peeling the epilogue keep track of the distance between the phi
0317   /// nodes and the kernel.
0318   DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
0319 
0320   /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
0321   /// loop kernel clones.
0322   DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
0323   DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
0324       BlockMIs;
0325 
0326   /// State passed from peelKernel to peelPrologAndEpilogs().
0327   std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
0328   /// Illegal phis that need to be deleted once we re-link stages.
0329   SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
0330 
0331   /// Converts BB from the original loop body to the rewritten, pipelined
0332   /// steady-state.
0333   void rewriteKernel();
0334 
0335   /// Peels one iteration of the rewritten kernel (BB) in the specified
0336   /// direction.
0337   MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
0338   // Delete instructions whose stage is less than MinStage in the given basic
0339   // block.
0340   void filterInstructions(MachineBasicBlock *MB, int MinStage);
0341   // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
0342   // instructions to keep a valid IR.
0343   void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
0344                               MachineBasicBlock *SourceBB, unsigned Stage);
0345   /// Peel the kernel forwards and backwards to produce prologs and epilogs,
0346   /// and stitch them together.
0347   void peelPrologAndEpilogs();
0348   /// All prolog and epilog blocks are clones of the kernel, so any produced
0349   /// register in one block has an corollary in all other blocks.
0350   Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
0351   /// Change all users of MI, if MI is predicated out
0352   /// (LiveStages[MI->getParent()] == false).
0353   void rewriteUsesOf(MachineInstr *MI);
0354   /// Insert branches between prologs, kernel and epilogs.
0355   void fixupBranches();
0356   /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
0357   /// to a block dominated by all prologs and epilogs. This allows us to treat
0358   /// the loop exiting block as any other kernel clone.
0359   MachineBasicBlock *CreateLCSSAExitingBlock();
0360   /// Helper to get the stage of an instruction in the schedule.
0361   unsigned getStage(MachineInstr *MI) {
0362     if (auto It = CanonicalMIs.find(MI); It != CanonicalMIs.end())
0363       MI = It->second;
0364     return Schedule.getStage(MI);
0365   }
0366   /// Helper function to find the right canonical register for a phi instruction
0367   /// coming from a peeled out prologue.
0368   Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
0369   /// Target loop info before kernel peeling.
0370   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
0371 };
0372 
0373 /// Expand the kernel using modulo variable expansion algorithm (MVE).
0374 /// It unrolls the kernel enough to avoid overlap of register lifetime.
0375 class ModuloScheduleExpanderMVE {
0376 private:
0377   using ValueMapTy = DenseMap<unsigned, unsigned>;
0378   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
0379   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
0380 
0381   ModuloSchedule &Schedule;
0382   MachineFunction &MF;
0383   const TargetSubtargetInfo &ST;
0384   MachineRegisterInfo &MRI;
0385   const TargetInstrInfo *TII = nullptr;
0386   LiveIntervals &LIS;
0387 
0388   MachineBasicBlock *OrigKernel = nullptr;
0389   MachineBasicBlock *OrigPreheader = nullptr;
0390   MachineBasicBlock *OrigExit = nullptr;
0391   MachineBasicBlock *Check = nullptr;
0392   MachineBasicBlock *Prolog = nullptr;
0393   MachineBasicBlock *NewKernel = nullptr;
0394   MachineBasicBlock *Epilog = nullptr;
0395   MachineBasicBlock *NewPreheader = nullptr;
0396   MachineBasicBlock *NewExit = nullptr;
0397   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
0398 
0399   /// The number of unroll required to avoid overlap of live ranges.
0400   /// NumUnroll = 1 means no unrolling.
0401   int NumUnroll;
0402 
0403   void calcNumUnroll();
0404   void generatePipelinedLoop();
0405   void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap);
0406   void generatePhi(MachineInstr *OrigMI, int UnrollNum,
0407                    SmallVectorImpl<ValueMapTy> &PrologVRMap,
0408                    SmallVectorImpl<ValueMapTy> &KernelVRMap,
0409                    SmallVectorImpl<ValueMapTy> &PhiVRMap);
0410   void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
0411                       SmallVectorImpl<ValueMapTy> &KernelVRMap,
0412                       InstrMapTy &LastStage0Insts);
0413   void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
0414                       SmallVectorImpl<ValueMapTy> &EpilogVRMap,
0415                       InstrMapTy &LastStage0Insts);
0416   void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
0417 
0418   MachineInstr *cloneInstr(MachineInstr *OldMI);
0419 
0420   void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef);
0421 
0422   void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal,
0423                          unsigned UnrollNum,
0424                          SmallVectorImpl<ValueMapTy> &VRMapProlog,
0425                          SmallVectorImpl<ValueMapTy> &VRMapPhi);
0426   void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum,
0427                       SmallVectorImpl<ValueMapTy> &CurVRMap,
0428                       SmallVectorImpl<ValueMapTy> *PrevVRMap);
0429 
0430   void insertCondBranch(MachineBasicBlock &MBB, int RequiredTC,
0431                         InstrMapTy &LastStage0Insts,
0432                         MachineBasicBlock &GreaterThan,
0433                         MachineBasicBlock &Otherwise);
0434 
0435 public:
0436   ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
0437                             LiveIntervals &LIS)
0438       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
0439         TII(ST.getInstrInfo()), LIS(LIS) {}
0440 
0441   void expand();
0442   static bool canApply(MachineLoop &L);
0443 };
0444 
0445 /// Expander that simply annotates each scheduled instruction with a post-instr
0446 /// symbol that can be consumed by the ModuloScheduleTest pass.
0447 ///
0448 /// The post-instr symbol is a way of annotating an instruction that can be
0449 /// roundtripped in MIR. The syntax is:
0450 ///   MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
0451 class ModuloScheduleTestAnnotater {
0452   MachineFunction &MF;
0453   ModuloSchedule &S;
0454 
0455 public:
0456   ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
0457       : MF(MF), S(S) {}
0458 
0459   /// Performs the annotation.
0460   void annotate();
0461 };
0462 
0463 } // end namespace llvm
0464 
0465 #endif // LLVM_CODEGEN_MODULOSCHEDULE_H