File indexing completed on 2026-05-10 08:43:27
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
0017 #define LLVM_CODEGEN_BASICTTIIMPL_H
0018
0019 #include "llvm/ADT/APInt.h"
0020 #include "llvm/ADT/BitVector.h"
0021 #include "llvm/ADT/SmallPtrSet.h"
0022 #include "llvm/ADT/SmallVector.h"
0023 #include "llvm/Analysis/LoopInfo.h"
0024 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
0025 #include "llvm/Analysis/TargetTransformInfo.h"
0026 #include "llvm/Analysis/TargetTransformInfoImpl.h"
0027 #include "llvm/Analysis/ValueTracking.h"
0028 #include "llvm/CodeGen/ISDOpcodes.h"
0029 #include "llvm/CodeGen/TargetLowering.h"
0030 #include "llvm/CodeGen/TargetSubtargetInfo.h"
0031 #include "llvm/CodeGen/ValueTypes.h"
0032 #include "llvm/CodeGenTypes/MachineValueType.h"
0033 #include "llvm/IR/BasicBlock.h"
0034 #include "llvm/IR/Constant.h"
0035 #include "llvm/IR/Constants.h"
0036 #include "llvm/IR/DataLayout.h"
0037 #include "llvm/IR/DerivedTypes.h"
0038 #include "llvm/IR/InstrTypes.h"
0039 #include "llvm/IR/Instruction.h"
0040 #include "llvm/IR/Instructions.h"
0041 #include "llvm/IR/Intrinsics.h"
0042 #include "llvm/IR/Operator.h"
0043 #include "llvm/IR/Type.h"
0044 #include "llvm/IR/Value.h"
0045 #include "llvm/Support/Casting.h"
0046 #include "llvm/Support/CommandLine.h"
0047 #include "llvm/Support/ErrorHandling.h"
0048 #include "llvm/Support/MathExtras.h"
0049 #include "llvm/Target/TargetMachine.h"
0050 #include "llvm/Target/TargetOptions.h"
0051 #include "llvm/Transforms/Utils/LoopUtils.h"
0052 #include <algorithm>
0053 #include <cassert>
0054 #include <cstdint>
0055 #include <limits>
0056 #include <optional>
0057 #include <utility>
0058
0059 namespace llvm {
0060
0061 class Function;
0062 class GlobalValue;
0063 class LLVMContext;
0064 class ScalarEvolution;
0065 class SCEV;
0066 class TargetMachine;
0067
0068 extern cl::opt<unsigned> PartialUnrollingThreshold;
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079 template <typename T>
0080 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
0081 private:
0082 using BaseT = TargetTransformInfoImplCRTPBase<T>;
0083 using TTI = TargetTransformInfo;
0084
0085
  /// CRTP helper: cast to the most-derived type so that cost queries below
  /// dispatch to target-specific overrides when the derived class provides one.
  T *thisT() { return static_cast<T *>(this); }
0087
0088
0089
0090 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
0091 TTI::TargetCostKind CostKind) {
0092 InstructionCost Cost = 0;
0093
0094
0095 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
0096 CostKind, 0, nullptr, nullptr);
0097
0098 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
0099 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
0100 CostKind, i, nullptr, nullptr);
0101 }
0102 return Cost;
0103 }
0104
0105
0106
0107 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
0108 TTI::TargetCostKind CostKind) {
0109 InstructionCost Cost = 0;
0110
0111
0112
0113
0114
0115
0116
0117 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
0118 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
0119 CostKind, i, nullptr, nullptr);
0120 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
0121 CostKind, i, nullptr, nullptr);
0122 }
0123 return Cost;
0124 }
0125
0126
0127
0128 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
0129 TTI::TargetCostKind CostKind,
0130 int Index,
0131 FixedVectorType *SubVTy) {
0132 assert(VTy && SubVTy &&
0133 "Can only extract subvectors from vectors");
0134 int NumSubElts = SubVTy->getNumElements();
0135 assert((!isa<FixedVectorType>(VTy) ||
0136 (Index + NumSubElts) <=
0137 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
0138 "SK_ExtractSubvector index out of range");
0139
0140 InstructionCost Cost = 0;
0141
0142
0143
0144 for (int i = 0; i != NumSubElts; ++i) {
0145 Cost +=
0146 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
0147 CostKind, i + Index, nullptr, nullptr);
0148 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
0149 CostKind, i, nullptr, nullptr);
0150 }
0151 return Cost;
0152 }
0153
0154
0155
0156 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
0157 TTI::TargetCostKind CostKind,
0158 int Index,
0159 FixedVectorType *SubVTy) {
0160 assert(VTy && SubVTy &&
0161 "Can only insert subvectors into vectors");
0162 int NumSubElts = SubVTy->getNumElements();
0163 assert((!isa<FixedVectorType>(VTy) ||
0164 (Index + NumSubElts) <=
0165 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
0166 "SK_InsertSubvector index out of range");
0167
0168 InstructionCost Cost = 0;
0169
0170
0171
0172 for (int i = 0; i != NumSubElts; ++i) {
0173 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
0174 CostKind, i, nullptr, nullptr);
0175 Cost +=
0176 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
0177 i + Index, nullptr, nullptr);
0178 }
0179 return Cost;
0180 }
0181
0182
  /// Local query method returning the subtarget info, forwarded to the
  /// CRTP-derived implementation.
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method returning the target lowering info, forwarded to the
  /// CRTP-derived implementation.
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  /// Translate a TTI memory-indexing mode into the equivalent ISD one.
  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }
0207
  /// Shared scalarization cost model for masked loads/stores and
  /// gathers/scatters: the operation is costed as VF scalar memory ops plus
  /// the vector packing/unpacking and (optionally) per-lane control flow.
  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind,
                                              unsigned AddressSpace = 0) {
    // Scalable vectors cannot be decomposed into a known number of lanes.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    unsigned VF = VT->getNumElements();

    // For gathers/scatters, each lane's pointer must first be extracted from
    // the vector of addresses.
    InstructionCost AddrExtractCost =
        IsGatherScatter ? getScalarizationOverhead(
                              FixedVectorType::get(
                                  PointerType::get(VT->getContext(), 0), VF),
                              false, true, CostKind)
                        : 0;

    // One scalar load/store per lane.
    InstructionCost MemoryOpCost =
        VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
                                      AddressSpace, CostKind);

    // Loads insert each scalar result into the vector; stores extract each
    // operand lane out of it.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // With a non-constant mask every lane is guarded: extract the i1 mask
      // bit, branch on it, and merge with a phi.
      ConditionalCost =
          getScalarizationOverhead(
              FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
              false, true, CostKind) +
          VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
                thisT()->getCFInstrCost(Instruction::PHI, CostKind));
    }

    return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
  }
0260
0261
0262
0263
0264
0265
  /// Returns true if \p Mask is a splat shuffle mask: every non-poison
  /// element selects the same source element. On success, \p Index receives
  /// that source element. Elements referencing beyond two source vectors
  /// (>= NumSrcElts * 2) disqualify the mask.
  static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
    // IsCompared tracks whether at least one element has been checked against
    // the candidate splat index; the last-element conditions below appear to
    // reject masks whose only defining element is the final one (i.e. a splat
    // must be confirmed before the mask ends) — NOTE(review): confirm intent.
    bool IsCompared = false;
    if (int SplatIdx = PoisonMaskElem;
        all_of(enumerate(Mask), [&](const auto &P) {
          if (P.value() == PoisonMaskElem)
            return P.index() != Mask.size() - 1 || IsCompared;
          if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
            return false;
          if (SplatIdx == PoisonMaskElem) {
            // First defined element: record the candidate splat index.
            SplatIdx = P.value();
            return P.index() != Mask.size() - 1;
          }
          IsCompared = true;
          return SplatIdx == P.value();
        })) {
      Index = SplatIdx;
      return true;
    }
    return false;
  }
0287
0288 protected:
  /// Only the DataLayout is consumed here; the TargetMachine parameter is
  /// used by derived implementations.
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;
0294
0295 public:
0296
0297
  /// Returns true if the target allows a misaligned access of \p BitWidth
  /// bits in \p AddressSpace; \p Fast (if non-null) receives the target's
  /// fast-path indication.
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  /// Inlining is compatible when the callee's subtarget features are a
  /// subset of the caller's, i.e. the caller can execute anything the callee
  /// was compiled for.
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const {
    const TargetMachine &TM = getTLI()->getTargetMachine();

    const FeatureBitset &CallerBits =
        TM.getSubtargetImpl(*Caller)->getFeatureBits();
    const FeatureBitset &CalleeBits =
        TM.getSubtargetImpl(*Callee)->getFeatureBits();

    // Subset test: masking the caller's features with the callee's must
    // leave the callee's features unchanged.
    return (CallerBits & CalleeBits) == CalleeBits;
  }
0319
  // Conservative defaults for divergence analysis: no divergence modeling.
  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  // By default no address-space cast is considered valid for rewriting.
  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  // Conservatively assume any two address spaces may alias.
  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  // -1 signals that the target has no flat address space.
  unsigned getFlatAddressSpace() {
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  // The following queries delegate directly to the TargetMachine.
  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  bool isSingleThreaded() const {
    return getTLI()->getTargetMachine().Options.ThreadModel ==
           ThreadModel::Single;
  }

  std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  }

  // nullptr means "no rewrite performed" for address-space propagation.
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }
0366
  // Immediate-legality queries forwarded to TargetLowering.
  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalAddScalableImmediate(int64_t Imm) {
    return getTLI()->isLegalAddScalableImmediate(Imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  /// Packs the components into a TargetLowering::AddrMode and asks the
  /// target whether the addressing mode [BaseGV + BaseOffset + HasBaseReg +
  /// Scale*Reg + ScalableOffset*vscale] is legal for type \p Ty.
  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
                             Instruction *I = nullptr,
                             int64_t ScalableOffset = 0) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = ScalableOffset;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
    return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
  }
0395
  /// Returns the smallest vectorization factor (halving from \p VF) for which
  /// the next-smaller store would still be supported by the target, either as
  /// a legal/custom plain store or as a legal truncating store.
  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    // Note: the lambda tests VF / 2, i.e. whether halving is still viable.
    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      // Otherwise check for a legal truncating store of the value type into
      // the legalized memory type.
      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
      EVT LegalizedVT =
          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
    };
    while (VF > 2 && IsSupportedByTarget(VF))
      VF /= 2;
    return VF;
  }
0415
  /// Returns true if an indexed (pre/post inc/dec) load of \p Ty is legal.
  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  /// Returns true if an indexed (pre/post inc/dec) store of \p Ty is legal.
  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }
0427
  // Loop-strength-reduction policy hooks: fall back to the base defaults.
  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldDropLSRSolutionIfLessProfitable() const {
    return TargetTransformInfoImplBase::shouldDropLSRSolutionIfLessProfitable();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  /// Returns 0 when the addressing mode is legal (free), -1 otherwise.
  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset.getFixed();
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = BaseOffset.getScalable();
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  /// A type is legal if it maps to a simple, natively supported value type.
  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty, true);
    return getTLI()->isTypeLegal(VT);
  }

  /// Number of registers needed to hold a value of type \p Ty.
  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }
0483
  /// Estimate how many case clusters a switch lowers to: returns 1 when the
  /// whole switch can become a single bit-test or jump table (setting
  /// \p JumpTableSize for the latter), otherwise the raw case count N
  /// (i.e. one comparison/branch per case).
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early out: no cases, or too many cases to bit-test when jump tables
    // are disallowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    // Compute the signed min/max case values for range-based lowering.
    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check whether a single bit-test covers all cases.
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check whether a jump table is suitable; its size is the case-value
    // range (clamped to avoid uint64_t overflow on the +1).
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }
0542
0543 bool shouldBuildLookupTables() {
0544 const TargetLoweringBase *TLI = getTLI();
0545 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
0546 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
0547 }
0548
  /// Relative (PC-relative) lookup tables only make sense for PIC code on
  /// 64-bit targets with a small/kernel code model, excluding AArch64 Darwin.
  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();

    if (!TM.isPositionIndependent())
      return false;

    // Medium/large code models cannot assume the table fits in 32-bit
    // relative offsets.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    const Triple &TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // Excluded combination; presumably a platform-specific limitation on
    // AArch64 Darwin — see upstream history for rationale.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  /// sqrt is "fast" when the legalized type supports a native FSQRT.
  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  /// FP ops cost TCC_Basic when FADD is supported for the type, otherwise
  /// they are treated as expensive (likely libcall/expansion).
  InstructionCost getFPOpCost(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  /// Keep constant divisors attached to div/rem instructions when the target
  /// would otherwise lower the division expensively (magic-number expansion
  /// depends on seeing the constant).
  bool preferToKeepConstantsAttached(const Instruction &Inst,
                                     const Function &Fn) const {
    switch (Inst.getOpcode()) {
    default:
      break;
    case Instruction::SDiv:
    case Instruction::SRem:
    case Instruction::UDiv:
    case Instruction::URem: {
      if (!isa<ConstantInt>(Inst.getOperand(1)))
        return false;
      EVT VT = getTLI()->getValueType(DL, Inst.getType());
      return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
    }
    };

    return false;
  }

  // Inliner tuning defaults.
  unsigned getInliningThresholdMultiplier() const { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
    return 0;
  }

  int getInlinerVectorBonusPercent() const { return 150; }
0621
  /// Default unrolling policy: enable partial/runtime unrolling with a
  /// threshold derived from the subtarget's loop micro-op buffer size (or
  /// the -partial-unrolling-threshold flag), but refuse to unroll loops
  /// containing real calls.
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) {
    // Pick the op budget: explicit flag wins, then the scheduler's loop
    // micro-op buffer size; with neither available, leave defaults alone.
    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop for calls that actually lower to calls; such loops are
    // not unrolled (and we tell the user why via a remark).
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            // Intrinsics/builtins that don't become calls are fine.
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          if (ORE) {
            ORE->emit([&]() {
              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
                                        L->getHeader())
                     << "advising against unrolling the loop because it "
                        "contains a "
                     << ore::NV("Call", &I);
            });
          }
          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the op budget.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Assume two branch/increment instructions of loop overhead per
    // iteration when sizing the unrolled body.
    UP.BEInsns = 2;
  }
0693
  /// Default peeling policy: allow peeling (but not of loop nests), with no
  /// peel count chosen here.
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  // The following hooks simply forward to the TargetTransformInfoImplCRTPBase
  // defaults; targets override them in their derived TTI classes.
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  unsigned getEpilogueVectorizationMinVF() {
    return BaseT::getEpilogueVectorizationMinVF();
  }

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    return BaseT::preferPredicateOverEpilogue(TFI);
  }

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
    return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  // Cache and prefetch parameters come straight from the subtarget; virtual
  // so targets can refine them.
  virtual std::optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return std::optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual std::optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    std::optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    // Fall back to the base implementation when the subtarget has no answer.
    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
    return getST()->shouldPrefetchAddressSpace(AS);
  }

  /// Default register width: 32 bits, fixed.
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  // vscale is unknown by default.
  std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
  std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
0802
0803
0804
0805
  /// Estimate the overhead of scalarizing \p InTy for the lanes selected by
  /// \p DemandedElts: per demanded lane, one insert (when \p Insert) and/or
  /// one extract (when \p Extract). \p VL, if provided, supplies the scalar
  /// values being inserted so targets can cost them more precisely.
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract,
                                           TTI::TargetCostKind CostKind,
                                           ArrayRef<Value *> VL = {}) {
    // Scalable vectors have no fixed lane count to iterate over.
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           (VL.empty() || VL.size() == Ty->getNumElements()) &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert) {
        Value *InsertedVal = VL.empty() ? nullptr : VL[i];
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
                                            CostKind, i, nullptr, InsertedVal);
      }
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, i, nullptr, nullptr);
    }

    return Cost;
  }

  // Target-intrinsic scalarization queries; conservative defaults.
  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const {
    return false;
  }

  bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                          unsigned ScalarOpdIdx) const {
    return false;
  }

  // -1 denotes the return value for overload-type queries.
  bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
                                              int OpdIdx) const {
    return OpdIdx == -1;
  }

  bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID,
                                                        int RetIdx) const {
    return RetIdx == 0;
  }
0857
0858
0859 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
0860 bool Extract,
0861 TTI::TargetCostKind CostKind) {
0862 if (isa<ScalableVectorType>(InTy))
0863 return InstructionCost::getInvalid();
0864 auto *Ty = cast<FixedVectorType>(InTy);
0865
0866 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
0867 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
0868 CostKind);
0869 }
0870
0871
0872
0873
  /// Estimate the cost of extracting the scalar lanes of each unique,
  /// non-constant vector operand in \p Args (operands must be extracted
  /// once no matter how many times they are used).
  InstructionCost
  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                   ArrayRef<Type *> Tys,
                                   TTI::TargetCostKind CostKind) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Only int/FP/pointer (or vectors thereof) operands contribute.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      // Constants are materialized, not extracted; duplicates cost once.
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, false,
                                           true, CostKind);
      }
    }

    return Cost;
  }

  /// Combined scalarization cost of a result vector plus its operands: the
  /// result is inserted lane by lane; operands are extracted. Without
  /// operand info, the result type is conservatively assumed to stand in
  /// for a single operand's extraction cost too.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys,
                                           TTI::TargetCostKind CostKind) {
    InstructionCost Cost = getScalarizationOverhead(
        RetTy, true, false, CostKind);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
    else
      Cost += getScalarizationOverhead(RetTy, false,
                                       true, CostKind);

    return Cost;
  }
0920
0921
  /// Estimate the cost of legalizing \p Ty: walk the legalization steps the
  /// type would go through, doubling the cost for each vector split or
  /// integer expansion, and return the final (cost, legal MVT) pair.
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;

    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Scalarizing a scalable vector has no computable cost; report an
        // invalid cost (with a placeholder MVT when the type isn't simple).
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Splitting a vector or expanding an integer halves the width and
      // doubles the number of operations.
      if (LK.first == TargetLoweringBase::TypeSplitVector ||
          LK.first == TargetLoweringBase::TypeExpandInteger)
        Cost *= 2;

      // Guard against legalization not making progress.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Continue legalizing from the converted type.
      MTy = LK.second;
    }
  }

  unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
0957
  /// Cost an arithmetic instruction: legal/promoted ops cost the legalization
  /// factor times a base op cost (FP ops weighted 2x), custom-lowered ops
  /// double that, rem may be rewritten as div+mul+sub, and everything else on
  /// fixed vectors is costed by full scalarization.
  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Only the throughput model below applies; other cost kinds use the
    // base implementation.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much
    // as integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply by the
      // type-legalization overhead.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // Custom lowering: assume roughly twice the cost of a legal op.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // Scalable vectors cannot be scalarized below.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op: per-element op cost
    // plus the cost of extracting operands and inserting results.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Args, CxtI);
      // Return the cost of multiple scalar invocations plus the
      // scalarization overhead.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
1034
  /// Refine a generic permute shuffle kind into a cheaper, more specific kind
  /// (reverse, broadcast, extract/insert-subvector, select, transpose,
  /// splice) by inspecting the concrete mask; \p Index and \p SubTy are
  /// populated for the subvector kinds.
  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask,
                                              VectorType *Ty, int &Index,
                                              VectorType *&SubTy) const {
    if (Mask.empty())
      return Kind;
    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc: {
      if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
        return TTI::SK_Broadcast;
      if (isSplatMask(Mask, NumSrcElts, Index))
        return TTI::SK_Broadcast;
      // Extract-subvector only when the extracted range stays in bounds.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
          (Index + Mask.size()) <= (size_t)NumSrcElts) {
        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
        return TTI::SK_ExtractSubvector;
      }
      break;
    }
    case TTI::SK_PermuteTwoSrc: {
      int NumSubElts;
      if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                                 Mask, NumSrcElts, NumSubElts, Index)) {
        // Out-of-bounds insert point: keep the generic kind.
        if (Index + NumSubElts > NumSrcElts)
          return Kind;
        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
        return TTI::SK_InsertSubvector;
      }
      if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
        return TTI::SK_Transpose;
      if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
        return TTI::SK_Splice;
      break;
    }
    // Already-specific kinds pass through unchanged.
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }
1085
  /// Default shuffle cost: first refine the kind from the mask, then fall
  /// back to the scalarization-based overhead helpers. Scalable vectors are
  /// only supported for the subvector kinds' source operand (the helpers
  /// require a fixed subvector type).
  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = {},
                                 const Instruction *CxtI = nullptr) {
    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
    case TTI::SK_Broadcast:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      // All generic permute-like kinds are costed as a full scalarization.
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }
1115
  /// Estimate the cost of a cast instruction \p Opcode from \p Src to \p Dst.
  /// \p CCH gives context (e.g. whether an ext folds into a load); \p I is
  /// the actual instruction when available.
  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    // If the base implementation already treats this cast as free, so do we.
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    // First, look for casts the target reports as free.
    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // A free truncate costs nothing...
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      // ...otherwise treat it like a bitcast between same-sized legal types.
      [[fallthrough]];
    case Instruction::BitCast:
      // Bitcast between legalized types of the same size and the same
      // int/pointer-ness is a no-op.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      // In a normal (load-feeding) context, an extension that the target can
      // fold into an extending load is free.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // Legal (or promoted) operation with matching legalization factors:
    // assume one instruction per legalized part.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Scalar-to-scalar cast.
    if (!SrcVTy && !DstVTy) {
      // Not expanded: a single instruction.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;
      // Expanded by the target: assume a small library-call-like cost.
      return 4;
    }

    // Vector-to-vector cast.
    if (DstVTy && SrcVTy) {
      // Same legalization factor and same legalized width.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
        // zext is typically a cheap unpack-style operation.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;
        // sext additionally needs the sign replicated: count it double.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;
        // Anything else the target doesn't expand: one instruction per part.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If either side is legalized by splitting, cost the cast recursively
      // on the half-sized vectors (two sub-casts), plus a split overhead
      // when only one side actually splits.
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // Only charge the split cost when one side stays whole.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalable vectors can't be scalarized below.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // Fall back to scalarization: one scalar cast per element plus the
      // extract/insert traffic.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      return getScalarizationOverhead(DstVTy, /*Insert=*/true, /*Extract=*/true,
                                      CostKind) +
             Num * Cost;
    }

    // Mixed scalar/vector bitcast: charge only the side that is a vector
    // (extract all elements of a vector source, or insert all elements of a
    // vector destination).
    if (Opcode == Instruction::BitCast) {
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert=*/false,
                                                /*Extract=*/true, CostKind)
                     : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert=*/true,
                                                /*Extract=*/false, CostKind)
                     : 0);
    }

    llvm_unreachable("Unhandled cast");
  }
1275
1276 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1277 VectorType *VecTy, unsigned Index) {
1278 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1279 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1280 CostKind, Index, nullptr, nullptr) +
1281 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1282 TTI::CastContextHint::None, CostKind);
1283 }
1284
1285 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1286 const Instruction *I = nullptr) {
1287 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1288 }
1289
  /// Estimate the cost of a compare or select. \p ValTy is the compared /
  /// selected type, \p CondTy the condition type (must exist for selects).
  InstructionCost getCmpSelInstrCost(
      unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Only reciprocal throughput is modelled here; all other cost kinds are
    // handled by the base implementation.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       Op1Info, Op2Info, I);

    // A select with a vector condition maps to VSELECT at the ISD level.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // If the value type legalizes to a vector (or is scalar) and the
    // operation isn't expanded, assume one instruction per legalized part.
    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply by the
      // type-legalization factor.
      return LT.first * 1;
    }

    // Otherwise, scalarize a fixed-width vector: per-element scalar
    // compare/select cost plus the overhead of inserting the results.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      // Scalable vectors cannot be scalarized.
      if (isa<ScalableVectorType>(ValTy))
        return InstructionCost::getInvalid();

      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      InstructionCost Cost =
          thisT()->getCmpSelInstrCost(Opcode, ValVTy->getScalarType(), CondTy,
                                      VecPred, CostKind, Op1Info, Op2Info, I);

      // Scalar extraction of the operands is assumed free here; only the
      // insertion of the per-lane results is charged.
      return getScalarizationOverhead(ValVTy, /*Insert=*/true,
                                      /*Extract=*/false, CostKind) +
             Num * Cost;
    }

    // Expanded scalar compare/select: still assume a unit cost.
    return 1;
  }
1344
1345 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1346 TTI::TargetCostKind CostKind,
1347 unsigned Index, Value *Op0, Value *Op1) {
1348 return getRegUsageForType(Val->getScalarType());
1349 }
1350
1351
1352
1353
1354
1355 InstructionCost getVectorInstrCost(
1356 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1357 Value *Scalar,
1358 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
1359 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
1360 nullptr);
1361 }
1362
1363 InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1364 TTI::TargetCostKind CostKind,
1365 unsigned Index) {
1366 Value *Op0 = nullptr;
1367 Value *Op1 = nullptr;
1368 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1369 Op0 = IE->getOperand(0);
1370 Op1 = IE->getOperand(1);
1371 }
1372 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1373 Op1);
1374 }
1375
  /// Cost of a "replication" shuffle that repeats each of the \p VF source
  /// elements \p ReplicationFactor times. Only the destination lanes set in
  /// \p DemandedDstElts (bit width VF * ReplicationFactor) must hold correct
  /// values. Generic model: extract the demanded source elements, then
  /// insert them into the demanded destination lanes.
  InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                            int VF,
                                            const APInt &DemandedDstElts,
                                            TTI::TargetCostKind CostKind) {
    assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
           "Unexpected size of DemandedDstElts.");

    InstructionCost Cost;

    auto *SrcVT = FixedVectorType::get(EltTy, VF);
    auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);

    // A source element is demanded iff at least one of the destination lanes
    // it replicates into is demanded; ScaleBitMask collapses each group of
    // ReplicationFactor destination bits down to one source bit.
    APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
    // Extract the demanded source elements...
    Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
                                              /*Insert=*/false,
                                              /*Extract=*/true, CostKind);
    // ...and insert them into the demanded lanes of the replicated vector.
    Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
                                              /*Insert=*/true,
                                              /*Extract=*/false, CostKind);

    return Cost;
  }
1408
  /// Cost of a plain (unmasked) load or store of type \p Src.
  InstructionCost
  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
                  TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
                  const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Types with no MVT representation get a small flat cost.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

    // Assume one memory instruction per legalized part.
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    // NOTE: this local DL deliberately shadows the member used above.
    const DataLayout &DL = this->getDataLayout();
    if (Src->isVectorTy() &&
        // The in-memory vector is narrower than the legalized register type,
        // so a load must extend and a store must truncate.
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits())) {
      // Check whether the target supports the required truncating store /
      // extending load natively; otherwise the access is scalarized.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // Scalarized access: add insertion of loaded elements (for a load)
        // or extraction of the elements to be stored (for a store).
        Cost += getScalarizationOverhead(
            cast<VectorType>(Src), Opcode != Instruction::Store,
            Opcode == Instruction::Store, CostKind);
      }
    }

    return Cost;
  }
1453
1454 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1455 Align Alignment, unsigned AddressSpace,
1456 TTI::TargetCostKind CostKind) {
1457
1458 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1459 CostKind);
1460 }
1461
1462 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1463 const Value *Ptr, bool VariableMask,
1464 Align Alignment,
1465 TTI::TargetCostKind CostKind,
1466 const Instruction *I = nullptr) {
1467 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1468 true, CostKind);
1469 }
1470
1471 InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
1472 const Value *Ptr, bool VariableMask,
1473 Align Alignment,
1474 TTI::TargetCostKind CostKind,
1475 const Instruction *I) {
1476
1477
1478
1479 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1480 Alignment, CostKind, I);
1481 }
1482
  /// Cost of an interleaved load/store group with \p Factor members over the
  /// whole wide vector type \p VecTy; \p Indices lists the live members.
  /// Generic model: one wide (possibly masked) memory op, plus the
  /// insert/extract traffic needed to (de)interleave, plus mask replication
  /// when a mask is required.
  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {
    // The scalarization-based model below cannot handle scalable vectors.
    if (isa<ScalableVectorType>(VecTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    // Each member of the group is a sub-vector of NumElts / Factor elements.
    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // Cost of the wide memory access itself (masked when a mask is needed
    // either for the condition or for gaps in the group).
    InstructionCost Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalization may split the wide access into several legal-width memory
    // instructions.
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // When only some members are demanded, some of those legal instructions
    // may cover no demanded lane at all and can be dropped: scale the memory
    // cost by the fraction of legal instructions actually used.
    if (Cost.isValid() && VecTySize > VecTyLTSize) {
      // Number of legal memory instructions the wide access splits into.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // Number of wide-vector elements covered by each legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);

      // Mark every legal instruction that holds at least one demanded lane.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the wide-access cost by used / total legal instructions,
      // rounding up.
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
    }

    // Now add the (de)interleaving shuffle traffic.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");

    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);

    // Lanes of the wide vector owned by live members: member Index occupies
    // lanes Index, Index + Factor, Index + 2*Factor, ...
    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    if (Opcode == Instruction::Load) {
      // Deinterleave: for each live member, insert its NumSubElts values
      // into a sub-vector, and extract the demanded lanes of the wide load.
      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
      Cost += Indices.size() * InsSubCost;
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert=*/false,
                                                /*Extract=*/true, CostKind);
    } else {
      // Interleave: extract every element of each member sub-vector and
      // insert them into the demanded lanes of the wide store.
      InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert=*/false, /*Extract=*/true, CostKind);
      Cost += ExtSubCost * Indices.size();
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert=*/true,
                                                /*Extract=*/false, CostKind);
    }

    if (!UseMaskForCond)
      return Cost;

    // Build the wide mask by replicating the per-member mask (one i8 per
    // sub-element) across the whole group.
    Type *I8Type = Type::getInt8Ty(VT->getContext());

    Cost += thisT()->getReplicationShuffleCost(
        I8Type, Factor, NumSubElts,
        UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
        CostKind);

    // With gaps, the replicated mask must additionally be AND'ed with a gap
    // mask that disables the lanes of dead members.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
                                              CostKind);
    }

    return Cost;
  }
1625
1626
  /// Estimate the cost of the intrinsic call described by \p ICA. Handles
  /// free intrinsics, VP (vector-predicated) intrinsics via their functional
  /// equivalents, a set of specially-modelled intrinsics, and falls back to
  /// the type-based cost with scalarization overhead.
  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind) {
    // If the base implementation already treats the call as free, trust it.
    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
      return 0;

    // Target intrinsics are assumed to be cheap (a single basic op).
    Intrinsic::ID IID = ICA.getID();
    if (Intrinsic::isTargetIntrinsic(IID))
      return TargetTransformInfo::TCC_Basic;

    // VP intrinsics: cost them like their non-predicated functional
    // equivalent where one exists.
    if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
      std::optional<unsigned> FOp =
          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
      if (FOp) {
        if (ICA.getID() == Intrinsic::vp_load) {
          Align Alignment;
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
            Alignment = VPI->getPointerAlignment().valueOrOne();
          unsigned AS = 0;
          if (ICA.getArgTypes().size() > 1)
            if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
              AS = PtrTy->getAddressSpace();
          return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
                                          AS, CostKind);
        }
        if (ICA.getID() == Intrinsic::vp_store) {
          Align Alignment;
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
            Alignment = VPI->getPointerAlignment().valueOrOne();
          unsigned AS = 0;
          if (ICA.getArgTypes().size() >= 2)
            if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
              AS = PtrTy->getAddressSpace();
          return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
                                          AS, CostKind);
        }
        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
          return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
                                                 CostKind);
        }
        if (VPCastIntrinsic::isVPCast(ICA.getID())) {
          return thisT()->getCastInstrCost(
              *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
              TTI::CastContextHint::None, CostKind);
        }
        if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
          // The compare predicate is only available from the instruction.
          if (ICA.getInst()) {
            assert(FOp);
            auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
            return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
                                               ICA.getReturnType(),
                                               UI->getPredicate(), CostKind);
          }
        }
      }

      // No functional opcode: try the functional (non-VP) intrinsic instead.
      std::optional<Intrinsic::ID> FID =
          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
      if (FID) {
        // Drop the trailing mask and vector-length arguments, which the
        // functional intrinsic does not take.
        assert(ICA.getArgTypes().size() >= 2 &&
               "Expected VPIntrinsic to have Mask and Vector Length args and "
               "types");
        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);

        // VP reductions (other than fadd/fmul) also carry a leading
        // start-value argument that the functional reduction doesn't take.
        if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
            *FID != Intrinsic::vector_reduce_fadd &&
            *FID != Intrinsic::vector_reduce_fmul)
          NewTys = NewTys.drop_front();

        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
                                       ICA.getFlags());
        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
      }
    }

    // No argument values available: use the type-based path directly.
    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    Type *RetTy = ICA.getReturnType();

    ElementCount RetVF =
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
    const IntrinsicInst *I = ICA.getInst();
    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
    FastMathFlags FMF = ICA.getFlags();
    switch (IID) {
    default:
      break;

    case Intrinsic::powi:
      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
                                               ShouldOptForSize)) {
          // powi with a constant exponent expands to repeated squaring:
          // (ActiveBits + PopCount - 2) multiplies, plus a final divide when
          // the exponent is negative.
          APInt Exponent = RHSC->getValue().abs();
          unsigned ActiveBits = Exponent.getActiveBits();
          unsigned PopCount = Exponent.popcount();
          InstructionCost Cost = (ActiveBits + PopCount - 2) *
                                 thisT()->getArithmeticInstrCost(
                                     Instruction::FMul, RetTy, CostKind);
          if (RHSC->isNegative())
            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
                                                    CostKind);
          return Cost;
        }
      }
      break;
    case Intrinsic::cttz:
      // Cheap if the target can speculate a scalar cttz.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::ctlz:
      // Cheap if the target can speculate a scalar ctlz.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::memcpy:
      return thisT()->getMemcpyCost(ICA.getInst());

    case Intrinsic::masked_scatter: {
      // The mask is variable unless it is a compile-time constant.
      const Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Store,
                                             ICA.getArgTypes()[0], Args[1],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::masked_gather: {
      const Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::experimental_vp_strided_store: {
      const Value *Data = Args[0];
      const Value *Ptr = Args[1];
      const Value *Mask = Args[3];
      const Value *EVL = Args[4];
      // Effectively variable if either the mask or the EVL is non-constant.
      bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
      Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
      Align Alignment =
          I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
      return thisT()->getStridedMemoryOpCost(Instruction::Store,
                                             Data->getType(), Ptr, VarMask,
                                             Alignment, CostKind, I);
    }
    case Intrinsic::experimental_vp_strided_load: {
      const Value *Ptr = Args[0];
      const Value *Mask = Args[2];
      const Value *EVL = Args[3];
      bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
      Type *EltTy = cast<VectorType>(RetTy)->getElementType();
      Align Alignment =
          I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
      return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::stepvector: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      // A fixed-width stepvector is just a constant vector.
      return TargetTransformInfo::TCC_Basic;
    }
    case Intrinsic::vector_extract: {
      // A scalable-result extract cannot be expressed as a shuffle mask;
      // defer to the base implementation.
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
      return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
                                     cast<VectorType>(Args[0]->getType()), {},
                                     CostKind, Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::vector_insert: {
      // A scalable subvector insert cannot be expressed as a shuffle mask;
      // defer to the base implementation.
      if (isa<ScalableVectorType>(Args[1]->getType()))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
          CostKind, Index, cast<VectorType>(Args[1]->getType()));
    }
    case Intrinsic::vector_reverse: {
      return thisT()->getShuffleCost(TTI::SK_Reverse,
                                     cast<VectorType>(Args[0]->getType()), {},
                                     CostKind, 0, cast<VectorType>(RetTy));
    }
    case Intrinsic::vector_splice: {
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(TTI::SK_Splice,
                                     cast<VectorType>(Args[0]->getType()), {},
                                     CostKind, Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::vector_reduce_add:
    case Intrinsic::vector_reduce_mul:
    case Intrinsic::vector_reduce_and:
    case Intrinsic::vector_reduce_or:
    case Intrinsic::vector_reduce_xor:
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin:
    case Intrinsic::vector_reduce_fmaximum:
    case Intrinsic::vector_reduce_fminimum:
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin: {
      // Reductions are handled by the type-based path.
      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::vector_reduce_fadd:
    case Intrinsic::vector_reduce_fmul: {
      // fadd/fmul reductions carry an extra start-value operand.
      IntrinsicCostAttributes Attrs(
          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::fshl:
    case Intrinsic::fshr: {
      const Value *X = Args[0];
      const Value *Y = Args[1];
      const Value *Z = Args[2];
      const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
      const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
      const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
      const TTI::OperandValueInfo OpInfoBW =
        {TTI::OK_UniformConstantValue,
         isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                     : TTI::OP_None};

      // Cost the generic funnel-shift expansion: shift X left and Y right by
      // amounts derived from Z, then OR the halves together.
      InstructionCost Cost = 0;
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
          {OpInfoZ.Kind, TTI::OP_None});
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
          {OpInfoZ.Kind, TTI::OP_None});

      // A non-constant shift amount also needs reducing modulo the bit
      // width.
      if (!OpInfoZ.isConstant())
        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
                                                CostKind, OpInfoZ, OpInfoBW);

      // A true funnel shift (X != Y, i.e. not a rotate) must guard against a
      // zero shift amount with a compare and select.
      if (X != Y) {
        Type *CondTy = RetTy->getWithNewBitWidth(1);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                        CmpInst::ICMP_EQ, CostKind);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                        CmpInst::ICMP_EQ, CostKind);
      }
      return Cost;
    }
    case Intrinsic::get_active_lane_mask: {
      EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);

      // Natively supported: assume one instruction per legalized part.
      if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
        return getTypeLegalizationCost(RetTy).first;
      }

      // Otherwise cost the expansion: a saturating add (base index + step
      // vector) followed by an unsigned compare against the limit.
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
      InstructionCost Cost =
          thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
      Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
                                          CmpInst::ICMP_ULT, CostKind);
      return Cost;
    }
    case Intrinsic::experimental_cttz_elts: {
      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);

      // Natively supported: assume one instruction per legalized part.
      if (!getTLI()->shouldExpandCttzElements(ArgType))
        return getTypeLegalizationCost(RetTy).first;

      // Otherwise cost the generic expansion: a step vector masked with the
      // sign-extended input, reduced with umax, then subtracted.
      bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
      ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
      // For scalable inputs, narrow the vscale range from the caller's
      // attributes when available.
      if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
        VScaleRange = getVScaleRange(I->getCaller(), 64);

      unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
          RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
      Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);

      Type *NewVecTy = VectorType::get(
          NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());

      IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
                                           FMF);
      InstructionCost Cost =
          thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);

      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
      Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
                                        Args[0]->getType(),
                                        TTI::CastContextHint::None, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);

      IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
                                         NewEltTy, NewVecTy, FMF, I, 1);
      Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);

      return Cost;
    }
    case Intrinsic::experimental_vector_match:
      return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
    }

    // Fall back to the type-based cost, pre-computing the scalarization
    // overhead (result insertion + operand extraction) for fixed-width
    // vector returns so the type-based path doesn't have to re-derive it.
    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
    if (RetVF.isVector() && !RetVF.isScalable()) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost += getScalarizationOverhead(
            cast<VectorType>(RetTy),
            /*Insert=*/true, /*Extract=*/false, CostKind);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
    }

    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
                                  ScalarizationCost);
    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  }
1994
1995
1996
1997
1998
1999 InstructionCost
2000 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2001 TTI::TargetCostKind CostKind) {
2002 Intrinsic::ID IID = ICA.getID();
2003 Type *RetTy = ICA.getReturnType();
2004 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2005 FastMathFlags FMF = ICA.getFlags();
2006 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2007 bool SkipScalarizationCost = ICA.skipScalarizationCost();
2008
2009 VectorType *VecOpTy = nullptr;
2010 if (!Tys.empty()) {
2011
2012
2013 unsigned VecTyIndex = 0;
2014 if (IID == Intrinsic::vector_reduce_fadd ||
2015 IID == Intrinsic::vector_reduce_fmul)
2016 VecTyIndex = 1;
2017 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
2018 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
2019 }
2020
2021
2022 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
2023 unsigned ISD = 0;
2024 switch (IID) {
2025 default: {
2026
2027 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2028 return isa<ScalableVectorType>(Ty);
2029 }))
2030 return InstructionCost::getInvalid();
2031
2032
2033 InstructionCost ScalarizationCost =
2034 SkipScalarizationCost ? ScalarizationCostPassed : 0;
2035 unsigned ScalarCalls = 1;
2036 Type *ScalarRetTy = RetTy;
2037 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2038 if (!SkipScalarizationCost)
2039 ScalarizationCost = getScalarizationOverhead(
2040 RetVTy, true, false, CostKind);
2041 ScalarCalls = std::max(ScalarCalls,
2042 cast<FixedVectorType>(RetVTy)->getNumElements());
2043 ScalarRetTy = RetTy->getScalarType();
2044 }
2045 SmallVector<Type *, 4> ScalarTys;
2046 for (Type *Ty : Tys) {
2047 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2048 if (!SkipScalarizationCost)
2049 ScalarizationCost += getScalarizationOverhead(
2050 VTy, false, true, CostKind);
2051 ScalarCalls = std::max(ScalarCalls,
2052 cast<FixedVectorType>(VTy)->getNumElements());
2053 Ty = Ty->getScalarType();
2054 }
2055 ScalarTys.push_back(Ty);
2056 }
2057 if (ScalarCalls == 1)
2058 return 1;
2059
2060 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2061 InstructionCost ScalarCost =
2062 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2063
2064 return ScalarCalls * ScalarCost + ScalarizationCost;
2065 }
2066
2067
2068 case Intrinsic::sqrt:
2069 ISD = ISD::FSQRT;
2070 break;
2071 case Intrinsic::sin:
2072 ISD = ISD::FSIN;
2073 break;
2074 case Intrinsic::cos:
2075 ISD = ISD::FCOS;
2076 break;
2077 case Intrinsic::sincos:
2078 ISD = ISD::FSINCOS;
2079 break;
2080 case Intrinsic::tan:
2081 ISD = ISD::FTAN;
2082 break;
2083 case Intrinsic::asin:
2084 ISD = ISD::FASIN;
2085 break;
2086 case Intrinsic::acos:
2087 ISD = ISD::FACOS;
2088 break;
2089 case Intrinsic::atan:
2090 ISD = ISD::FATAN;
2091 break;
2092 case Intrinsic::atan2:
2093 ISD = ISD::FATAN2;
2094 break;
2095 case Intrinsic::sinh:
2096 ISD = ISD::FSINH;
2097 break;
2098 case Intrinsic::cosh:
2099 ISD = ISD::FCOSH;
2100 break;
2101 case Intrinsic::tanh:
2102 ISD = ISD::FTANH;
2103 break;
2104 case Intrinsic::exp:
2105 ISD = ISD::FEXP;
2106 break;
2107 case Intrinsic::exp2:
2108 ISD = ISD::FEXP2;
2109 break;
2110 case Intrinsic::exp10:
2111 ISD = ISD::FEXP10;
2112 break;
2113 case Intrinsic::log:
2114 ISD = ISD::FLOG;
2115 break;
2116 case Intrinsic::log10:
2117 ISD = ISD::FLOG10;
2118 break;
2119 case Intrinsic::log2:
2120 ISD = ISD::FLOG2;
2121 break;
2122 case Intrinsic::fabs:
2123 ISD = ISD::FABS;
2124 break;
2125 case Intrinsic::canonicalize:
2126 ISD = ISD::FCANONICALIZE;
2127 break;
2128 case Intrinsic::minnum:
2129 ISD = ISD::FMINNUM;
2130 break;
2131 case Intrinsic::maxnum:
2132 ISD = ISD::FMAXNUM;
2133 break;
2134 case Intrinsic::minimum:
2135 ISD = ISD::FMINIMUM;
2136 break;
2137 case Intrinsic::maximum:
2138 ISD = ISD::FMAXIMUM;
2139 break;
2140 case Intrinsic::minimumnum:
2141 ISD = ISD::FMINIMUMNUM;
2142 break;
2143 case Intrinsic::maximumnum:
2144 ISD = ISD::FMAXIMUMNUM;
2145 break;
2146 case Intrinsic::copysign:
2147 ISD = ISD::FCOPYSIGN;
2148 break;
2149 case Intrinsic::floor:
2150 ISD = ISD::FFLOOR;
2151 break;
2152 case Intrinsic::ceil:
2153 ISD = ISD::FCEIL;
2154 break;
2155 case Intrinsic::trunc:
2156 ISD = ISD::FTRUNC;
2157 break;
2158 case Intrinsic::nearbyint:
2159 ISD = ISD::FNEARBYINT;
2160 break;
2161 case Intrinsic::rint:
2162 ISD = ISD::FRINT;
2163 break;
2164 case Intrinsic::lrint:
2165 ISD = ISD::LRINT;
2166 break;
2167 case Intrinsic::llrint:
2168 ISD = ISD::LLRINT;
2169 break;
2170 case Intrinsic::round:
2171 ISD = ISD::FROUND;
2172 break;
2173 case Intrinsic::roundeven:
2174 ISD = ISD::FROUNDEVEN;
2175 break;
2176 case Intrinsic::pow:
2177 ISD = ISD::FPOW;
2178 break;
2179 case Intrinsic::fma:
2180 ISD = ISD::FMA;
2181 break;
2182 case Intrinsic::fmuladd:
2183 ISD = ISD::FMA;
2184 break;
2185 case Intrinsic::experimental_constrained_fmuladd:
2186 ISD = ISD::STRICT_FMA;
2187 break;
2188
2189 case Intrinsic::lifetime_start:
2190 case Intrinsic::lifetime_end:
2191 case Intrinsic::sideeffect:
2192 case Intrinsic::pseudoprobe:
2193 case Intrinsic::arithmetic_fence:
2194 return 0;
2195 case Intrinsic::masked_store: {
2196 Type *Ty = Tys[0];
2197 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2198 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2199 CostKind);
2200 }
2201 case Intrinsic::masked_load: {
2202 Type *Ty = RetTy;
2203 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2204 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2205 CostKind);
2206 }
2207 case Intrinsic::vector_reduce_add:
2208 case Intrinsic::vector_reduce_mul:
2209 case Intrinsic::vector_reduce_and:
2210 case Intrinsic::vector_reduce_or:
2211 case Intrinsic::vector_reduce_xor:
2212 return thisT()->getArithmeticReductionCost(
2213 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2214 CostKind);
2215 case Intrinsic::vector_reduce_fadd:
2216 case Intrinsic::vector_reduce_fmul:
2217 return thisT()->getArithmeticReductionCost(
2218 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2219 case Intrinsic::vector_reduce_smax:
2220 case Intrinsic::vector_reduce_smin:
2221 case Intrinsic::vector_reduce_umax:
2222 case Intrinsic::vector_reduce_umin:
2223 case Intrinsic::vector_reduce_fmax:
2224 case Intrinsic::vector_reduce_fmin:
2225 case Intrinsic::vector_reduce_fmaximum:
2226 case Intrinsic::vector_reduce_fminimum:
2227 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2228 VecOpTy, ICA.getFlags(), CostKind);
2229 case Intrinsic::experimental_vector_match: {
2230 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2231 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2232 unsigned SearchSize = NeedleTy->getNumElements();
2233
2234
2235
2236 EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2237 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2238 return getTypeLegalizationCost(RetTy).first;
2239
2240
2241
2242 InstructionCost Cost = 0;
2243 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2244 CostKind, 1, nullptr, nullptr);
2245 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2246 CostKind, 0, nullptr, nullptr);
2247 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
2248 CostKind, 0, nullptr);
2249 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2250 CmpInst::ICMP_EQ, CostKind);
2251 Cost +=
2252 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2253 Cost *= SearchSize;
2254 Cost +=
2255 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2256 return Cost;
2257 }
2258 case Intrinsic::abs:
2259 ISD = ISD::ABS;
2260 break;
2261 case Intrinsic::fshl:
2262 ISD = ISD::FSHL;
2263 break;
2264 case Intrinsic::fshr:
2265 ISD = ISD::FSHR;
2266 break;
2267 case Intrinsic::smax:
2268 ISD = ISD::SMAX;
2269 break;
2270 case Intrinsic::smin:
2271 ISD = ISD::SMIN;
2272 break;
2273 case Intrinsic::umax:
2274 ISD = ISD::UMAX;
2275 break;
2276 case Intrinsic::umin:
2277 ISD = ISD::UMIN;
2278 break;
2279 case Intrinsic::sadd_sat:
2280 ISD = ISD::SADDSAT;
2281 break;
2282 case Intrinsic::ssub_sat:
2283 ISD = ISD::SSUBSAT;
2284 break;
2285 case Intrinsic::uadd_sat:
2286 ISD = ISD::UADDSAT;
2287 break;
2288 case Intrinsic::usub_sat:
2289 ISD = ISD::USUBSAT;
2290 break;
2291 case Intrinsic::smul_fix:
2292 ISD = ISD::SMULFIX;
2293 break;
2294 case Intrinsic::umul_fix:
2295 ISD = ISD::UMULFIX;
2296 break;
2297 case Intrinsic::sadd_with_overflow:
2298 ISD = ISD::SADDO;
2299 break;
2300 case Intrinsic::ssub_with_overflow:
2301 ISD = ISD::SSUBO;
2302 break;
2303 case Intrinsic::uadd_with_overflow:
2304 ISD = ISD::UADDO;
2305 break;
2306 case Intrinsic::usub_with_overflow:
2307 ISD = ISD::USUBO;
2308 break;
2309 case Intrinsic::smul_with_overflow:
2310 ISD = ISD::SMULO;
2311 break;
2312 case Intrinsic::umul_with_overflow:
2313 ISD = ISD::UMULO;
2314 break;
2315 case Intrinsic::fptosi_sat:
2316 ISD = ISD::FP_TO_SINT_SAT;
2317 break;
2318 case Intrinsic::fptoui_sat:
2319 ISD = ISD::FP_TO_UINT_SAT;
2320 break;
2321 case Intrinsic::ctpop:
2322 ISD = ISD::CTPOP;
2323
2324
2325 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2326 break;
2327 case Intrinsic::ctlz:
2328 ISD = ISD::CTLZ;
2329 break;
2330 case Intrinsic::cttz:
2331 ISD = ISD::CTTZ;
2332 break;
2333 case Intrinsic::bswap:
2334 ISD = ISD::BSWAP;
2335 break;
2336 case Intrinsic::bitreverse:
2337 ISD = ISD::BITREVERSE;
2338 break;
2339 case Intrinsic::ucmp:
2340 ISD = ISD::UCMP;
2341 break;
2342 case Intrinsic::scmp:
2343 ISD = ISD::SCMP;
2344 break;
2345 }
2346
2347 auto *ST = dyn_cast<StructType>(RetTy);
2348 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2349 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2350
2351 const TargetLoweringBase *TLI = getTLI();
2352
2353 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2354 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2355 TLI->isFAbsFree(LT.second)) {
2356 return 0;
2357 }
2358
2359
2360
2361
2362
2363 if (LT.first > 1)
2364 return (LT.first * 2);
2365 else
2366 return (LT.first * 1);
2367 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2368
2369
2370 return (LT.first * 2);
2371 }
2372
2373 switch (IID) {
2374 case Intrinsic::fmuladd: {
2375
2376
2377
2378 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2379 CostKind) +
2380 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2381 CostKind);
2382 }
2383 case Intrinsic::experimental_constrained_fmuladd: {
2384 IntrinsicCostAttributes FMulAttrs(
2385 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2386 IntrinsicCostAttributes FAddAttrs(
2387 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2388 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2389 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2390 }
2391 case Intrinsic::smin:
2392 case Intrinsic::smax:
2393 case Intrinsic::umin:
2394 case Intrinsic::umax: {
2395
2396 Type *CondTy = RetTy->getWithNewBitWidth(1);
2397 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2398 CmpInst::Predicate Pred =
2399 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2400 InstructionCost Cost = 0;
2401 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2402 Pred, CostKind);
2403 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2404 Pred, CostKind);
2405 return Cost;
2406 }
2407 case Intrinsic::sadd_with_overflow:
2408 case Intrinsic::ssub_with_overflow: {
2409 Type *SumTy = RetTy->getContainedType(0);
2410 Type *OverflowTy = RetTy->getContainedType(1);
2411 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2412 ? BinaryOperator::Add
2413 : BinaryOperator::Sub;
2414
2415
2416
2417
2418
2419 InstructionCost Cost = 0;
2420 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2421 Cost +=
2422 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2423 CmpInst::ICMP_SGT, CostKind);
2424 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2425 CostKind);
2426 return Cost;
2427 }
2428 case Intrinsic::uadd_with_overflow:
2429 case Intrinsic::usub_with_overflow: {
2430 Type *SumTy = RetTy->getContainedType(0);
2431 Type *OverflowTy = RetTy->getContainedType(1);
2432 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2433 ? BinaryOperator::Add
2434 : BinaryOperator::Sub;
2435 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2436 ? CmpInst::ICMP_ULT
2437 : CmpInst::ICMP_UGT;
2438
2439 InstructionCost Cost = 0;
2440 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2441 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2442 OverflowTy, Pred, CostKind);
2443 return Cost;
2444 }
2445 case Intrinsic::smul_with_overflow:
2446 case Intrinsic::umul_with_overflow: {
2447 Type *MulTy = RetTy->getContainedType(0);
2448 Type *OverflowTy = RetTy->getContainedType(1);
2449 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2450 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2451 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2452
2453 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2454 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2455
2456 InstructionCost Cost = 0;
2457 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2458 Cost +=
2459 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2460 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2461 CCH, CostKind);
2462 Cost += thisT()->getArithmeticInstrCost(
2463 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2464 {TTI::OK_UniformConstantValue, TTI::OP_None});
2465
2466 if (IsSigned)
2467 Cost += thisT()->getArithmeticInstrCost(
2468 Instruction::AShr, MulTy, CostKind,
2469 {TTI::OK_AnyValue, TTI::OP_None},
2470 {TTI::OK_UniformConstantValue, TTI::OP_None});
2471
2472 Cost += thisT()->getCmpSelInstrCost(
2473 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2474 return Cost;
2475 }
2476 case Intrinsic::sadd_sat:
2477 case Intrinsic::ssub_sat: {
2478
2479 Type *CondTy = RetTy->getWithNewBitWidth(1);
2480
2481 Type *OpTy = StructType::create({RetTy, CondTy});
2482 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2483 ? Intrinsic::sadd_with_overflow
2484 : Intrinsic::ssub_with_overflow;
2485 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2486
2487
2488
2489 InstructionCost Cost = 0;
2490 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2491 nullptr, ScalarizationCostPassed);
2492 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2493 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2494 Pred, CostKind);
2495 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2496 CondTy, Pred, CostKind);
2497 return Cost;
2498 }
2499 case Intrinsic::uadd_sat:
2500 case Intrinsic::usub_sat: {
2501 Type *CondTy = RetTy->getWithNewBitWidth(1);
2502
2503 Type *OpTy = StructType::create({RetTy, CondTy});
2504 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2505 ? Intrinsic::uadd_with_overflow
2506 : Intrinsic::usub_with_overflow;
2507
2508 InstructionCost Cost = 0;
2509 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2510 nullptr, ScalarizationCostPassed);
2511 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2512 Cost +=
2513 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2514 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2515 return Cost;
2516 }
2517 case Intrinsic::smul_fix:
2518 case Intrinsic::umul_fix: {
2519 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2520 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2521
2522 unsigned ExtOp =
2523 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2524 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2525
2526 InstructionCost Cost = 0;
2527 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2528 Cost +=
2529 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2530 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2531 CCH, CostKind);
2532 Cost += thisT()->getArithmeticInstrCost(
2533 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2534 {TTI::OK_UniformConstantValue, TTI::OP_None});
2535 Cost += thisT()->getArithmeticInstrCost(
2536 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2537 {TTI::OK_UniformConstantValue, TTI::OP_None});
2538 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2539 return Cost;
2540 }
2541 case Intrinsic::abs: {
2542
2543 Type *CondTy = RetTy->getWithNewBitWidth(1);
2544 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2545 InstructionCost Cost = 0;
2546 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2547 Pred, CostKind);
2548 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2549 Pred, CostKind);
2550
2551 Cost += thisT()->getArithmeticInstrCost(
2552 BinaryOperator::Sub, RetTy, CostKind,
2553 {TTI::OK_UniformConstantValue, TTI::OP_None});
2554 return Cost;
2555 }
2556 case Intrinsic::fshl:
2557 case Intrinsic::fshr: {
2558
2559
2560 Type *CondTy = RetTy->getWithNewBitWidth(1);
2561 InstructionCost Cost = 0;
2562 Cost +=
2563 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2564 Cost +=
2565 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
2566 Cost +=
2567 thisT()->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
2568 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
2569 CostKind);
2570 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
2571 CostKind);
2572
2573 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2574 CmpInst::ICMP_EQ, CostKind);
2575 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2576 CmpInst::ICMP_EQ, CostKind);
2577 return Cost;
2578 }
2579 case Intrinsic::fptosi_sat:
2580 case Intrinsic::fptoui_sat: {
2581 if (Tys.empty())
2582 break;
2583 Type *FromTy = Tys[0];
2584 bool IsSigned = IID == Intrinsic::fptosi_sat;
2585
2586 InstructionCost Cost = 0;
2587 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2588 {FromTy, FromTy});
2589 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2590 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2591 {FromTy, FromTy});
2592 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2593 Cost += thisT()->getCastInstrCost(
2594 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2595 TTI::CastContextHint::None, CostKind);
2596 if (IsSigned) {
2597 Type *CondTy = RetTy->getWithNewBitWidth(1);
2598 Cost += thisT()->getCmpSelInstrCost(
2599 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2600 Cost += thisT()->getCmpSelInstrCost(
2601 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2602 }
2603 return Cost;
2604 }
2605 case Intrinsic::ucmp:
2606 case Intrinsic::scmp: {
2607 Type *CmpTy = Tys[0];
2608 Type *CondTy = RetTy->getWithNewBitWidth(1);
2609 InstructionCost Cost =
2610 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2611 CmpIntrinsic::getGTPredicate(IID),
2612 CostKind) +
2613 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2614 CmpIntrinsic::getLTPredicate(IID),
2615 CostKind);
2616
2617 EVT VT = TLI->getValueType(DL, CmpTy, true);
2618 if (TLI->shouldExpandCmpUsingSelects(VT)) {
2619
2620 Cost += 2 * thisT()->getCmpSelInstrCost(
2621 BinaryOperator::Select, RetTy, CondTy,
2622 ICmpInst::BAD_ICMP_PREDICATE, CostKind);
2623 } else {
2624
2625 Cost +=
2626 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
2627 TTI::CastContextHint::None, CostKind);
2628 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2629 CostKind);
2630 }
2631 return Cost;
2632 }
2633 default:
2634 break;
2635 }
2636
2637
2638
2639
2640 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2641
2642 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2643 return isa<ScalableVectorType>(Ty);
2644 }))
2645 return InstructionCost::getInvalid();
2646
2647 InstructionCost ScalarizationCost =
2648 SkipScalarizationCost
2649 ? ScalarizationCostPassed
2650 : getScalarizationOverhead(RetVTy, true,
2651 false, CostKind);
2652
2653 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2654 SmallVector<Type *, 4> ScalarTys;
2655 for (Type *Ty : Tys) {
2656 if (Ty->isVectorTy())
2657 Ty = Ty->getScalarType();
2658 ScalarTys.push_back(Ty);
2659 }
2660 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2661 InstructionCost ScalarCost =
2662 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2663 for (Type *Ty : Tys) {
2664 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2665 if (!ICA.skipScalarizationCost())
2666 ScalarizationCost += getScalarizationOverhead(
2667 VTy, false, true, CostKind);
2668 ScalarCalls = std::max(ScalarCalls,
2669 cast<FixedVectorType>(VTy)->getNumElements());
2670 }
2671 }
2672 return ScalarCalls * ScalarCost + ScalarizationCost;
2673 }
2674
2675
2676 return SingleCallCost;
2677 }
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2691 ArrayRef<Type *> Tys,
2692 TTI::TargetCostKind CostKind) {
2693 return 10;
2694 }
2695
2696 unsigned getNumberOfParts(Type *Tp) {
2697 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2698 if (!LT.first.isValid())
2699 return 0;
2700
2701
2702 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
2703 Tp && LT.second.isFixedLengthVector() &&
2704 !has_single_bit(FTp->getNumElements())) {
2705 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
2706 EVT(LT.second).getTypeForEVT(Tp->getContext()));
2707 SubTp && SubTp->getElementType() == FTp->getElementType())
2708 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
2709 }
2710 return *LT.first.getValue();
2711 }
2712
2713 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2714 const SCEV *) {
2715 return 0;
2716 }
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
  /// Estimate the arithmetic and shuffle costs of a tree-wise (pairwise)
  /// reduction: repeatedly split the vector in halves and combine them with
  /// the binary \p Opcode until one element remains, then extract it.
  InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
                                       TTI::TargetCostKind CostKind) {
    // The lane count of a scalable vector is unknown here, so targets must
    // provide their own cost for scalable types.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
        NumVecElts >= 2) {
      // An or/and reduction of <N x i1> is priced as a bitcast of the mask to
      // an iN integer plus a single integer compare:
      //   %val = bitcast <N x i1> to iN
      //   %res = icmp <pred> iN %val, <constant>
      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
      return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
                                       TTI::CastContextHint::None, CostKind) +
             thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
                                         CmpInst::makeCmpResultType(ValTy),
                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost ArithCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    // While the vector is wider than one legal register, halve it with an
    // extract-subvector shuffle and combine the halves with the binary op.
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
                                             CostKind, NumVecElts, SubTy);
      ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    // The levels handled by the splitting loop above are done; the remaining
    // levels operate on a single (legal-width) register.
    NumReduxLevels -= LongVectorCount;

    // Each remaining reduction level is priced as one single-source shuffle
    // plus one arithmetic op at the legal vector width.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 {}, CostKind, 0, Ty);
    ArithCost +=
        NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
    // Finally, a single extract of lane 0 yields the scalar result.
    return ShuffleCost + ArithCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                       CostKind, 0, nullptr, nullptr);
  }
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2816 TTI::TargetCostKind CostKind) {
2817
2818
2819 if (isa<ScalableVectorType>(Ty))
2820 return InstructionCost::getInvalid();
2821
2822 auto *VTy = cast<FixedVectorType>(Ty);
2823 InstructionCost ExtractCost = getScalarizationOverhead(
2824 VTy, false, true, CostKind);
2825 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2826 Opcode, VTy->getElementType(), CostKind);
2827 ArithCost *= VTy->getNumElements();
2828
2829 return ExtractCost + ArithCost;
2830 }
2831
2832 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2833 std::optional<FastMathFlags> FMF,
2834 TTI::TargetCostKind CostKind) {
2835 assert(Ty && "Unknown reduction vector type");
2836 if (TTI::requiresOrderedReduction(FMF))
2837 return getOrderedReductionCost(Opcode, Ty, CostKind);
2838 return getTreeReductionCost(Opcode, Ty, CostKind);
2839 }
2840
2841
2842
  /// Estimate the cost of a min/max reduction as a pairwise tree: halve the
  /// vector with shuffles, combine the halves via the min/max intrinsic
  /// \p IID, and extract the final lane.
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind) {
    // The lane count of a scalable vector is unknown here, so targets must
    // provide their own cost for scalable types.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost MinMaxCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    // While the vector is wider than one legal register, halve it with an
    // extract-subvector shuffle and combine the halves with the intrinsic.
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);

      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
                                             CostKind, NumVecElts, SubTy);

      IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
      MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    // The levels handled by the splitting loop above are done; the remaining
    // levels operate on a single (legal-width) register.
    NumReduxLevels -= LongVectorCount;

    // Each remaining level is priced as one single-source shuffle plus one
    // min/max intrinsic at the legal vector width.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 {}, CostKind, 0, Ty);
    IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
    MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
    // The last min/max was counted above, so only a single extract of lane 0
    // remains to produce the scalar result.
    return ShuffleCost + MinMaxCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                       CostKind, 0, nullptr, nullptr);
  }
2890
  /// Cost of a reduction whose source lanes are first extended (zero- or
  /// sign-extended per \p IsUnsigned) from the element type of \p Ty to
  /// \p ResTy, with the widened vector then reduced with \p Opcode.
  InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
                                           Type *ResTy, VectorType *Ty,
                                           FastMathFlags FMF,
                                           TTI::TargetCostKind CostKind) {
    if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
        FTy && IsUnsigned && Opcode == Instruction::Add &&
        FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
      // Special case: vector_reduce_add(ZExt(<n x i1>)) is priced as
      // ctpop(bitcast <n x i1> to in), i.e. one bitcast plus one ctpop.
      auto *IntTy =
          IntegerType::get(ResTy->getContext(), FTy->getNumElements());
      IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
      return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
                                       TTI::CastContextHint::None, CostKind) +
             thisT()->getIntrinsicInstrCost(ICA, CostKind);
    }

    // Generic fallback: model as reduce(ext(Ty)) — the cost of extending the
    // source vector to the result element type plus the reduction at the
    // widened type.
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
    InstructionCost RedCost =
        thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
    InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
        TTI::CastContextHint::None, CostKind);

    return RedCost + ExtCost;
  }
2918
2919 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2920 VectorType *Ty,
2921 TTI::TargetCostKind CostKind) {
2922
2923
2924
2925 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2926 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2927 Instruction::Add, ExtTy, std::nullopt, CostKind);
2928 InstructionCost ExtCost = thisT()->getCastInstrCost(
2929 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2930 TTI::CastContextHint::None, CostKind);
2931
2932 InstructionCost MulCost =
2933 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2934
2935 return RedCost + MulCost + 2 * ExtCost;
2936 }
2937
  // Extra cost charged when a vector operation has to be split into halves.
  InstructionCost getVectorSplitCost() { return 1; }
2939
2940
2941 };
2942
2943
2944
/// Concrete public host for BasicTTIImplBase: answers the CRTP hooks with
/// the TargetSubtargetInfo and TargetLoweringBase taken from a
/// TargetMachine for a given function.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  // The CRTP base needs access to the private getST()/getTLI() hooks below.
  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  // Defined out of line; presumably pulls ST/TLI from TM for F — confirm in
  // the corresponding .cpp.
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};
2959
2960 }
2961
2962 #endif