1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering
12 /// interfaces.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
18 
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/CodeGen/ISDOpcodes.h"
30 #include "llvm/CodeGen/TargetLowering.h"
31 #include "llvm/CodeGen/TargetSubtargetInfo.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/CodeGenTypes/MachineValueType.h"
34 #include "llvm/IR/BasicBlock.h"
35 #include "llvm/IR/Constant.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/DataLayout.h"
38 #include "llvm/IR/DerivedTypes.h"
39 #include "llvm/IR/InstrTypes.h"
40 #include "llvm/IR/Instruction.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/Operator.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/Value.h"
46 #include "llvm/Support/Casting.h"
47 #include "llvm/Support/CommandLine.h"
48 #include "llvm/Support/ErrorHandling.h"
49 #include "llvm/Support/MathExtras.h"
50 #include "llvm/Target/TargetMachine.h"
51 #include "llvm/Target/TargetOptions.h"
52 #include "llvm/Transforms/Utils/LoopUtils.h"
53 #include <algorithm>
54 #include <cassert>
55 #include <cstdint>
56 #include <limits>
57 #include <optional>
58 #include <utility>
59 
60 namespace llvm {
61 
62 class Function;
63 class GlobalValue;
64 class LLVMContext;
65 class ScalarEvolution;
66 class SCEV;
67 class TargetMachine;
68 
69 extern cl::opt<unsigned> PartialUnrollingThreshold;
70 
71 /// Base class which can be used to help build a TTI implementation.
72 ///
73 /// This class provides as much implementation of the TTI interface as is
74 /// possible using the target independent parts of the code generator.
75 ///
76 /// In order to subclass it, your class must implement a getST() method to
77 /// return the subtarget, and a getTLI() method to return the target lowering.
78 /// We need these methods implemented in the derived class so that this class
79 /// doesn't have to duplicate storage for them.
80 template <typename T>
81 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
82 private:
83   using BaseT = TargetTransformInfoImplCRTPBase<T>;
84   using TTI = TargetTransformInfo;
85 
86   /// Helper function to access this as a T.
thisT()87   T *thisT() { return static_cast<T *>(this); }
88 
89   /// Estimate a cost of Broadcast as an extract and sequence of insert
90   /// operations.
getBroadcastShuffleOverhead(FixedVectorType * VTy,TTI::TargetCostKind CostKind)91   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
92                                               TTI::TargetCostKind CostKind) {
93     InstructionCost Cost = 0;
94     // Broadcast cost is equal to the cost of extracting the zero'th element
95     // plus the cost of inserting it into every element of the result vector.
96     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
97                                         CostKind, 0, nullptr, nullptr);
98 
99     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
100       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
101                                           CostKind, i, nullptr, nullptr);
102     }
103     return Cost;
104   }
105 
106   /// Estimate a cost of shuffle as a sequence of extract and insert
107   /// operations.
getPermuteShuffleOverhead(FixedVectorType * VTy,TTI::TargetCostKind CostKind)108   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
109                                             TTI::TargetCostKind CostKind) {
110     InstructionCost Cost = 0;
111     // Shuffle cost is equal to the cost of extracting element from its argument
112     // plus the cost of inserting them onto the result vector.
113 
114     // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
115     // index 0 of first vector, index 1 of second vector,index 2 of first
116     // vector and finally index 3 of second vector and insert them at index
117     // <0,1,2,3> of result vector.
118     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
119       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
120                                           CostKind, i, nullptr, nullptr);
121       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
122                                           CostKind, i, nullptr, nullptr);
123     }
124     return Cost;
125   }
126 
127   /// Estimate a cost of subvector extraction as a sequence of extract and
128   /// insert operations.
getExtractSubvectorOverhead(VectorType * VTy,TTI::TargetCostKind CostKind,int Index,FixedVectorType * SubVTy)129   InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
130                                               TTI::TargetCostKind CostKind,
131                                               int Index,
132                                               FixedVectorType *SubVTy) {
133     assert(VTy && SubVTy &&
134            "Can only extract subvectors from vectors");
135     int NumSubElts = SubVTy->getNumElements();
136     assert((!isa<FixedVectorType>(VTy) ||
137             (Index + NumSubElts) <=
138                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
139            "SK_ExtractSubvector index out of range");
140 
141     InstructionCost Cost = 0;
142     // Subvector extraction cost is equal to the cost of extracting element from
143     // the source type plus the cost of inserting them into the result vector
144     // type.
145     for (int i = 0; i != NumSubElts; ++i) {
146       Cost +=
147           thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
148                                       CostKind, i + Index, nullptr, nullptr);
149       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
150                                           CostKind, i, nullptr, nullptr);
151     }
152     return Cost;
153   }
154 
155   /// Estimate a cost of subvector insertion as a sequence of extract and
156   /// insert operations.
getInsertSubvectorOverhead(VectorType * VTy,TTI::TargetCostKind CostKind,int Index,FixedVectorType * SubVTy)157   InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
158                                              TTI::TargetCostKind CostKind,
159                                              int Index,
160                                              FixedVectorType *SubVTy) {
161     assert(VTy && SubVTy &&
162            "Can only insert subvectors into vectors");
163     int NumSubElts = SubVTy->getNumElements();
164     assert((!isa<FixedVectorType>(VTy) ||
165             (Index + NumSubElts) <=
166                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
167            "SK_InsertSubvector index out of range");
168 
169     InstructionCost Cost = 0;
170     // Subvector insertion cost is equal to the cost of extracting element from
171     // the source type plus the cost of inserting them into the result vector
172     // type.
173     for (int i = 0; i != NumSubElts; ++i) {
174       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
175                                           CostKind, i, nullptr, nullptr);
176       Cost +=
177           thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
178                                       i + Index, nullptr, nullptr);
179     }
180     return Cost;
181   }
182 
183   /// Local query method delegates up to T which *must* implement this!
getST()184   const TargetSubtargetInfo *getST() const {
185     return static_cast<const T *>(this)->getST();
186   }
187 
188   /// Local query method delegates up to T which *must* implement this!
getTLI()189   const TargetLoweringBase *getTLI() const {
190     return static_cast<const T *>(this)->getTLI();
191   }
192 
getISDIndexedMode(TTI::MemIndexedMode M)193   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
194     switch (M) {
195       case TTI::MIM_Unindexed:
196         return ISD::UNINDEXED;
197       case TTI::MIM_PreInc:
198         return ISD::PRE_INC;
199       case TTI::MIM_PreDec:
200         return ISD::PRE_DEC;
201       case TTI::MIM_PostInc:
202         return ISD::POST_INC;
203       case TTI::MIM_PostDec:
204         return ISD::POST_DEC;
205     }
206     llvm_unreachable("Unexpected MemIndexedMode");
207   }
208 
209   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
210                                               Align Alignment,
211                                               bool VariableMask,
212                                               bool IsGatherScatter,
213                                               TTI::TargetCostKind CostKind,
214                                               unsigned AddressSpace = 0) {
215     // We cannot scalarize scalable vectors, so return Invalid.
216     if (isa<ScalableVectorType>(DataTy))
217       return InstructionCost::getInvalid();
218 
219     auto *VT = cast<FixedVectorType>(DataTy);
220     unsigned VF = VT->getNumElements();
221 
222     // Assume the target does not have support for gather/scatter operations
223     // and provide a rough estimate.
224     //
225     // First, compute the cost of the individual memory operations.
226     InstructionCost AddrExtractCost =
227         IsGatherScatter
228             ? getScalarizationOverhead(
229                   FixedVectorType::get(
230                       PointerType::get(VT->getElementType(), 0), VF),
231                   /*Insert=*/false, /*Extract=*/true, CostKind)
232             : 0;
233 
234     // The cost of the scalar loads/stores.
235     InstructionCost MemoryOpCost =
236         VF * getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
237                              AddressSpace, CostKind);
238 
239     // Next, compute the cost of packing the result in a vector.
240     InstructionCost PackingCost =
241         getScalarizationOverhead(VT, Opcode != Instruction::Store,
242                                  Opcode == Instruction::Store, CostKind);
243 
244     InstructionCost ConditionalCost = 0;
245     if (VariableMask) {
246       // Compute the cost of conditionally executing the memory operations with
247       // variable masks. This includes extracting the individual conditions, a
248       // branches and PHIs to combine the results.
249       // NOTE: Estimating the cost of conditionally executing the memory
250       // operations accurately is quite difficult and the current solution
251       // provides a very rough estimate only.
252       ConditionalCost =
253           getScalarizationOverhead(
254               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
255               /*Insert=*/false, /*Extract=*/true, CostKind) +
256           VF * (getCFInstrCost(Instruction::Br, CostKind) +
257                 getCFInstrCost(Instruction::PHI, CostKind));
258     }
259 
260     return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
261   }
262 
263 protected:
BasicTTIImplBase(const TargetMachine * TM,const DataLayout & DL)264   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
265       : BaseT(DL) {}
266   virtual ~BasicTTIImplBase() = default;
267 
268   using TargetTransformInfoImplBase::DL;
269 
270 public:
271   /// \name Scalar TTI Implementations
272   /// @{
allowsMisalignedMemoryAccesses(LLVMContext & Context,unsigned BitWidth,unsigned AddressSpace,Align Alignment,unsigned * Fast)273   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
274                                       unsigned AddressSpace, Align Alignment,
275                                       unsigned *Fast) const {
276     EVT E = EVT::getIntegerVT(Context, BitWidth);
277     return getTLI()->allowsMisalignedMemoryAccesses(
278         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
279   }
280 
281   bool hasBranchDivergence(const Function *F = nullptr) { return false; }
282 
isSourceOfDivergence(const Value * V)283   bool isSourceOfDivergence(const Value *V) { return false; }
284 
isAlwaysUniform(const Value * V)285   bool isAlwaysUniform(const Value *V) { return false; }
286 
isValidAddrSpaceCast(unsigned FromAS,unsigned ToAS)287   bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
288     return false;
289   }
290 
addrspacesMayAlias(unsigned AS0,unsigned AS1)291   bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
292     return true;
293   }
294 
getFlatAddressSpace()295   unsigned getFlatAddressSpace() {
296     // Return an invalid address space.
297     return -1;
298   }
299 
collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID)300   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
301                                   Intrinsic::ID IID) const {
302     return false;
303   }
304 
isNoopAddrSpaceCast(unsigned FromAS,unsigned ToAS)305   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
306     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
307   }
308 
getAssumedAddrSpace(const Value * V)309   unsigned getAssumedAddrSpace(const Value *V) const {
310     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
311   }
312 
isSingleThreaded()313   bool isSingleThreaded() const {
314     return getTLI()->getTargetMachine().Options.ThreadModel ==
315            ThreadModel::Single;
316   }
317 
318   std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value * V)319   getPredicatedAddrSpace(const Value *V) const {
320     return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
321   }
322 
rewriteIntrinsicWithAddressSpace(IntrinsicInst * II,Value * OldV,Value * NewV)323   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
324                                           Value *NewV) const {
325     return nullptr;
326   }
327 
isLegalAddImmediate(int64_t imm)328   bool isLegalAddImmediate(int64_t imm) {
329     return getTLI()->isLegalAddImmediate(imm);
330   }
331 
isLegalAddScalableImmediate(int64_t Imm)332   bool isLegalAddScalableImmediate(int64_t Imm) {
333     return getTLI()->isLegalAddScalableImmediate(Imm);
334   }
335 
isLegalICmpImmediate(int64_t imm)336   bool isLegalICmpImmediate(int64_t imm) {
337     return getTLI()->isLegalICmpImmediate(imm);
338   }
339 
340   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
341                              bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
342                              Instruction *I = nullptr,
343                              int64_t ScalableOffset = 0) {
344     TargetLoweringBase::AddrMode AM;
345     AM.BaseGV = BaseGV;
346     AM.BaseOffs = BaseOffset;
347     AM.HasBaseReg = HasBaseReg;
348     AM.Scale = Scale;
349     AM.ScalableOffset = ScalableOffset;
350     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
351   }
352 
getPreferredLargeGEPBaseOffset(int64_t MinOffset,int64_t MaxOffset)353   int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
354     return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
355   }
356 
getStoreMinimumVF(unsigned VF,Type * ScalarMemTy,Type * ScalarValTy)357   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
358                              Type *ScalarValTy) const {
359     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
360       auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
361       EVT VT = getTLI()->getValueType(DL, SrcTy);
362       if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
363           getTLI()->isOperationCustom(ISD::STORE, VT))
364         return true;
365 
366       EVT ValVT =
367           getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
368       EVT LegalizedVT =
369           getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
370       return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
371     };
372     while (VF > 2 && IsSupportedByTarget(VF))
373       VF /= 2;
374     return VF;
375   }
376 
isIndexedLoadLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)377   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
378                           const DataLayout &DL) const {
379     EVT VT = getTLI()->getValueType(DL, Ty);
380     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
381   }
382 
isIndexedStoreLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)383   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
384                            const DataLayout &DL) const {
385     EVT VT = getTLI()->getValueType(DL, Ty);
386     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
387   }
388 
isLSRCostLess(TTI::LSRCost C1,TTI::LSRCost C2)389   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
390     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
391   }
392 
isNumRegsMajorCostOfLSR()393   bool isNumRegsMajorCostOfLSR() {
394     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
395   }
396 
shouldFoldTerminatingConditionAfterLSR()397   bool shouldFoldTerminatingConditionAfterLSR() const {
398     return TargetTransformInfoImplBase::
399         shouldFoldTerminatingConditionAfterLSR();
400   }
401 
isProfitableLSRChainElement(Instruction * I)402   bool isProfitableLSRChainElement(Instruction *I) {
403     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
404   }
405 
getScalingFactorCost(Type * Ty,GlobalValue * BaseGV,int64_t BaseOffset,bool HasBaseReg,int64_t Scale,unsigned AddrSpace)406   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
407                                        int64_t BaseOffset, bool HasBaseReg,
408                                        int64_t Scale, unsigned AddrSpace) {
409     TargetLoweringBase::AddrMode AM;
410     AM.BaseGV = BaseGV;
411     AM.BaseOffs = BaseOffset;
412     AM.HasBaseReg = HasBaseReg;
413     AM.Scale = Scale;
414     if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
415       return 0;
416     return -1;
417   }
418 
isTruncateFree(Type * Ty1,Type * Ty2)419   bool isTruncateFree(Type *Ty1, Type *Ty2) {
420     return getTLI()->isTruncateFree(Ty1, Ty2);
421   }
422 
isProfitableToHoist(Instruction * I)423   bool isProfitableToHoist(Instruction *I) {
424     return getTLI()->isProfitableToHoist(I);
425   }
426 
useAA()427   bool useAA() const { return getST()->useAA(); }
428 
isTypeLegal(Type * Ty)429   bool isTypeLegal(Type *Ty) {
430     EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
431     return getTLI()->isTypeLegal(VT);
432   }
433 
getRegUsageForType(Type * Ty)434   unsigned getRegUsageForType(Type *Ty) {
435     EVT ETy = getTLI()->getValueType(DL, Ty);
436     return getTLI()->getNumRegisters(Ty->getContext(), ETy);
437   }
438 
getGEPCost(Type * PointeeType,const Value * Ptr,ArrayRef<const Value * > Operands,Type * AccessType,TTI::TargetCostKind CostKind)439   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
440                              ArrayRef<const Value *> Operands, Type *AccessType,
441                              TTI::TargetCostKind CostKind) {
442     return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
443   }
444 
getEstimatedNumberOfCaseClusters(const SwitchInst & SI,unsigned & JumpTableSize,ProfileSummaryInfo * PSI,BlockFrequencyInfo * BFI)445   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
446                                             unsigned &JumpTableSize,
447                                             ProfileSummaryInfo *PSI,
448                                             BlockFrequencyInfo *BFI) {
449     /// Try to find the estimated number of clusters. Note that the number of
450     /// clusters identified in this function could be different from the actual
451     /// numbers found in lowering. This function ignore switches that are
452     /// lowered with a mix of jump table / bit test / BTree. This function was
453     /// initially intended to be used when estimating the cost of switch in
454     /// inline cost heuristic, but it's a generic cost model to be used in other
455     /// places (e.g., in loop unrolling).
456     unsigned N = SI.getNumCases();
457     const TargetLoweringBase *TLI = getTLI();
458     const DataLayout &DL = this->getDataLayout();
459 
460     JumpTableSize = 0;
461     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
462 
463     // Early exit if both a jump table and bit test are not allowed.
464     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
465       return N;
466 
467     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
468     APInt MinCaseVal = MaxCaseVal;
469     for (auto CI : SI.cases()) {
470       const APInt &CaseVal = CI.getCaseValue()->getValue();
471       if (CaseVal.sgt(MaxCaseVal))
472         MaxCaseVal = CaseVal;
473       if (CaseVal.slt(MinCaseVal))
474         MinCaseVal = CaseVal;
475     }
476 
477     // Check if suitable for a bit test
478     if (N <= DL.getIndexSizeInBits(0u)) {
479       SmallPtrSet<const BasicBlock *, 4> Dests;
480       for (auto I : SI.cases())
481         Dests.insert(I.getCaseSuccessor());
482 
483       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
484                                      DL))
485         return 1;
486     }
487 
488     // Check if suitable for a jump table.
489     if (IsJTAllowed) {
490       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
491         return N;
492       uint64_t Range =
493           (MaxCaseVal - MinCaseVal)
494               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
495       // Check whether a range of clusters is dense enough for a jump table
496       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
497         JumpTableSize = Range;
498         return 1;
499       }
500     }
501     return N;
502   }
503 
shouldBuildLookupTables()504   bool shouldBuildLookupTables() {
505     const TargetLoweringBase *TLI = getTLI();
506     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
507            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
508   }
509 
shouldBuildRelLookupTables()510   bool shouldBuildRelLookupTables() const {
511     const TargetMachine &TM = getTLI()->getTargetMachine();
512     // If non-PIC mode, do not generate a relative lookup table.
513     if (!TM.isPositionIndependent())
514       return false;
515 
516     /// Relative lookup table entries consist of 32-bit offsets.
517     /// Do not generate relative lookup tables for large code models
518     /// in 64-bit achitectures where 32-bit offsets might not be enough.
519     if (TM.getCodeModel() == CodeModel::Medium ||
520         TM.getCodeModel() == CodeModel::Large)
521       return false;
522 
523     Triple TargetTriple = TM.getTargetTriple();
524     if (!TargetTriple.isArch64Bit())
525       return false;
526 
527     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
528     // there.
529     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
530       return false;
531 
532     return true;
533   }
534 
haveFastSqrt(Type * Ty)535   bool haveFastSqrt(Type *Ty) {
536     const TargetLoweringBase *TLI = getTLI();
537     EVT VT = TLI->getValueType(DL, Ty);
538     return TLI->isTypeLegal(VT) &&
539            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
540   }
541 
isFCmpOrdCheaperThanFCmpZero(Type * Ty)542   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
543     return true;
544   }
545 
getFPOpCost(Type * Ty)546   InstructionCost getFPOpCost(Type *Ty) {
547     // Check whether FADD is available, as a proxy for floating-point in
548     // general.
549     const TargetLoweringBase *TLI = getTLI();
550     EVT VT = TLI->getValueType(DL, Ty);
551     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
552       return TargetTransformInfo::TCC_Basic;
553     return TargetTransformInfo::TCC_Expensive;
554   }
555 
preferToKeepConstantsAttached(const Instruction & Inst,const Function & Fn)556   bool preferToKeepConstantsAttached(const Instruction &Inst,
557                                      const Function &Fn) const {
558     switch (Inst.getOpcode()) {
559     default:
560       break;
561     case Instruction::SDiv:
562     case Instruction::SRem:
563     case Instruction::UDiv:
564     case Instruction::URem: {
565       if (!isa<ConstantInt>(Inst.getOperand(1)))
566         return false;
567       EVT VT = getTLI()->getValueType(DL, Inst.getType());
568       return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
569     }
570     };
571 
572     return false;
573   }
574 
getInliningThresholdMultiplier()575   unsigned getInliningThresholdMultiplier() const { return 1; }
adjustInliningThreshold(const CallBase * CB)576   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
getCallerAllocaCost(const CallBase * CB,const AllocaInst * AI)577   unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
578     return 0;
579   }
580 
getInlinerVectorBonusPercent()581   int getInlinerVectorBonusPercent() const { return 150; }
582 
getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE)583   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
584                                TTI::UnrollingPreferences &UP,
585                                OptimizationRemarkEmitter *ORE) {
586     // This unrolling functionality is target independent, but to provide some
587     // motivation for its intended use, for x86:
588 
589     // According to the Intel 64 and IA-32 Architectures Optimization Reference
590     // Manual, Intel Core models and later have a loop stream detector (and
591     // associated uop queue) that can benefit from partial unrolling.
592     // The relevant requirements are:
593     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
594     //    taken, and none of them may be calls.
595     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
596 
597     // According to the Software Optimization Guide for AMD Family 15h
598     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
599     // and loop buffer which can benefit from partial unrolling.
600     // The relevant requirements are:
601     //  - The loop must have fewer than 16 branches
602     //  - The loop must have less than 40 uops in all executed loop branches
603 
604     // The number of taken branches in a loop is hard to estimate here, and
605     // benchmarking has revealed that it is better not to be conservative when
606     // estimating the branch count. As a result, we'll ignore the branch limits
607     // until someone finds a case where it matters in practice.
608 
609     unsigned MaxOps;
610     const TargetSubtargetInfo *ST = getST();
611     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
612       MaxOps = PartialUnrollingThreshold;
613     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
614       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
615     else
616       return;
617 
618     // Scan the loop: don't unroll loops with calls.
619     for (BasicBlock *BB : L->blocks()) {
620       for (Instruction &I : *BB) {
621         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
622           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
623             if (!thisT()->isLoweredToCall(F))
624               continue;
625           }
626 
627           if (ORE) {
628             ORE->emit([&]() {
629               return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
630                                         L->getHeader())
631                      << "advising against unrolling the loop because it "
632                         "contains a "
633                      << ore::NV("Call", &I);
634             });
635           }
636           return;
637         }
638       }
639     }
640 
641     // Enable runtime and partial unrolling up to the specified size.
642     // Enable using trip count upper bound to unroll loops.
643     UP.Partial = UP.Runtime = UP.UpperBound = true;
644     UP.PartialThreshold = MaxOps;
645 
646     // Avoid unrolling when optimizing for size.
647     UP.OptSizeThreshold = 0;
648     UP.PartialOptSizeThreshold = 0;
649 
650     // Set number of instructions optimized when "back edge"
651     // becomes "fall through" to default value of 2.
652     UP.BEInsns = 2;
653   }
654 
getPeelingPreferences(Loop * L,ScalarEvolution & SE,TTI::PeelingPreferences & PP)655   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
656                              TTI::PeelingPreferences &PP) {
657     PP.PeelCount = 0;
658     PP.AllowPeeling = true;
659     PP.AllowLoopNestsPeeling = false;
660     PP.PeelProfiledIterations = true;
661   }
662 
isHardwareLoopProfitable(Loop * L,ScalarEvolution & SE,AssumptionCache & AC,TargetLibraryInfo * LibInfo,HardwareLoopInfo & HWLoopInfo)663   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
664                                 AssumptionCache &AC,
665                                 TargetLibraryInfo *LibInfo,
666                                 HardwareLoopInfo &HWLoopInfo) {
667     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
668   }
669 
preferPredicateOverEpilogue(TailFoldingInfo * TFI)670   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
671     return BaseT::preferPredicateOverEpilogue(TFI);
672   }
673 
674   TailFoldingStyle
675   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
676     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
677   }
678 
instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II)679   std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
680                                                IntrinsicInst &II) {
681     return BaseT::instCombineIntrinsic(IC, II);
682   }
683 
684   std::optional<Value *>
simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed)685   simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
686                                    APInt DemandedMask, KnownBits &Known,
687                                    bool &KnownBitsComputed) {
688     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
689                                                    KnownBitsComputed);
690   }
691 
simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> SimplifyAndSetOp)692   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
693       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
694       APInt &UndefElts2, APInt &UndefElts3,
695       std::function<void(Instruction *, unsigned, APInt, APInt &)>
696           SimplifyAndSetOp) {
697     return BaseT::simplifyDemandedVectorEltsIntrinsic(
698         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
699         SimplifyAndSetOp);
700   }
701 
702   virtual std::optional<unsigned>
getCacheSize(TargetTransformInfo::CacheLevel Level)703   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
704     return std::optional<unsigned>(
705         getST()->getCacheSize(static_cast<unsigned>(Level)));
706   }
707 
708   virtual std::optional<unsigned>
getCacheAssociativity(TargetTransformInfo::CacheLevel Level)709   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
710     std::optional<unsigned> TargetResult =
711         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
712 
713     if (TargetResult)
714       return TargetResult;
715 
716     return BaseT::getCacheAssociativity(Level);
717   }
718 
getCacheLineSize()719   virtual unsigned getCacheLineSize() const {
720     return getST()->getCacheLineSize();
721   }
722 
getPrefetchDistance()723   virtual unsigned getPrefetchDistance() const {
724     return getST()->getPrefetchDistance();
725   }
726 
getMinPrefetchStride(unsigned NumMemAccesses,unsigned NumStridedMemAccesses,unsigned NumPrefetches,bool HasCall)727   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
728                                         unsigned NumStridedMemAccesses,
729                                         unsigned NumPrefetches,
730                                         bool HasCall) const {
731     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
732                                          NumPrefetches, HasCall);
733   }
734 
getMaxPrefetchIterationsAhead()735   virtual unsigned getMaxPrefetchIterationsAhead() const {
736     return getST()->getMaxPrefetchIterationsAhead();
737   }
738 
enableWritePrefetching()739   virtual bool enableWritePrefetching() const {
740     return getST()->enableWritePrefetching();
741   }
742 
shouldPrefetchAddressSpace(unsigned AS)743   virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
744     return getST()->shouldPrefetchAddressSpace(AS);
745   }
746 
747   /// @}
748 
749   /// \name Vector TTI Implementations
750   /// @{
751 
getRegisterBitWidth(TargetTransformInfo::RegisterKind K)752   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
753     return TypeSize::getFixed(32);
754   }
755 
getMaxVScale()756   std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
getVScaleForTuning()757   std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
isVScaleKnownToBeAPowerOfTwo()758   bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
759 
760   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
761   /// are set if the demanded result elements need to be inserted and/or
762   /// extracted from vectors.
getScalarizationOverhead(VectorType * InTy,const APInt & DemandedElts,bool Insert,bool Extract,TTI::TargetCostKind CostKind)763   InstructionCost getScalarizationOverhead(VectorType *InTy,
764                                            const APInt &DemandedElts,
765                                            bool Insert, bool Extract,
766                                            TTI::TargetCostKind CostKind) {
767     /// FIXME: a bitfield is not a reasonable abstraction for talking about
768     /// which elements are needed from a scalable vector
769     if (isa<ScalableVectorType>(InTy))
770       return InstructionCost::getInvalid();
771     auto *Ty = cast<FixedVectorType>(InTy);
772 
773     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
774            "Vector size mismatch");
775 
776     InstructionCost Cost = 0;
777 
778     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
779       if (!DemandedElts[i])
780         continue;
781       if (Insert)
782         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
783                                             CostKind, i, nullptr, nullptr);
784       if (Extract)
785         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
786                                             CostKind, i, nullptr, nullptr);
787     }
788 
789     return Cost;
790   }
791 
792   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
getScalarizationOverhead(VectorType * InTy,bool Insert,bool Extract,TTI::TargetCostKind CostKind)793   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
794                                            bool Extract,
795                                            TTI::TargetCostKind CostKind) {
796     if (isa<ScalableVectorType>(InTy))
797       return InstructionCost::getInvalid();
798     auto *Ty = cast<FixedVectorType>(InTy);
799 
800     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
801     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
802                                              CostKind);
803   }
804 
805   /// Estimate the overhead of scalarizing an instructions unique
806   /// non-constant operands. The (potentially vector) types to use for each of
807   /// argument are passes via Tys.
808   InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value * > Args,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)809   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
810                                    ArrayRef<Type *> Tys,
811                                    TTI::TargetCostKind CostKind) {
812     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
813 
814     InstructionCost Cost = 0;
815     SmallPtrSet<const Value*, 4> UniqueOperands;
816     for (int I = 0, E = Args.size(); I != E; I++) {
817       // Disregard things like metadata arguments.
818       const Value *A = Args[I];
819       Type *Ty = Tys[I];
820       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
821           !Ty->isPtrOrPtrVectorTy())
822         continue;
823 
824       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
825         if (auto *VecTy = dyn_cast<VectorType>(Ty))
826           Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
827                                            /*Extract*/ true, CostKind);
828       }
829     }
830 
831     return Cost;
832   }
833 
834   /// Estimate the overhead of scalarizing the inputs and outputs of an
835   /// instruction, with return type RetTy and arguments Args of type Tys. If
836   /// Args are unknown (empty), then the cost associated with one argument is
837   /// added as a heuristic.
getScalarizationOverhead(VectorType * RetTy,ArrayRef<const Value * > Args,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)838   InstructionCost getScalarizationOverhead(VectorType *RetTy,
839                                            ArrayRef<const Value *> Args,
840                                            ArrayRef<Type *> Tys,
841                                            TTI::TargetCostKind CostKind) {
842     InstructionCost Cost = getScalarizationOverhead(
843         RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
844     if (!Args.empty())
845       Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
846     else
847       // When no information on arguments is provided, we add the cost
848       // associated with one argument as a heuristic.
849       Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
850                                        /*Extract*/ true, CostKind);
851 
852     return Cost;
853   }
854 
855   /// Estimate the cost of type-legalization and the legalized type.
getTypeLegalizationCost(Type * Ty)856   std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
857     LLVMContext &C = Ty->getContext();
858     EVT MTy = getTLI()->getValueType(DL, Ty);
859 
860     InstructionCost Cost = 1;
861     // We keep legalizing the type until we find a legal kind. We assume that
862     // the only operation that costs anything is the split. After splitting
863     // we need to handle two types.
864     while (true) {
865       TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
866 
867       if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
868         // Ensure we return a sensible simple VT here, since many callers of
869         // this function require it.
870         MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
871         return std::make_pair(InstructionCost::getInvalid(), VT);
872       }
873 
874       if (LK.first == TargetLoweringBase::TypeLegal)
875         return std::make_pair(Cost, MTy.getSimpleVT());
876 
877       if (LK.first == TargetLoweringBase::TypeSplitVector ||
878           LK.first == TargetLoweringBase::TypeExpandInteger)
879         Cost *= 2;
880 
881       // Do not loop with f128 type.
882       if (MTy == LK.second)
883         return std::make_pair(Cost, MTy.getSimpleVT());
884 
885       // Keep legalizing the type.
886       MTy = LK.second;
887     }
888   }
889 
getMaxInterleaveFactor(ElementCount VF)890   unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
891 
892   InstructionCost getArithmeticInstrCost(
893       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
894       TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
895       TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
896       ArrayRef<const Value *> Args = std::nullopt,
897       const Instruction *CxtI = nullptr) {
898     // Check if any of the operands are vector operands.
899     const TargetLoweringBase *TLI = getTLI();
900     int ISD = TLI->InstructionOpcodeToISD(Opcode);
901     assert(ISD && "Invalid opcode");
902 
903     // TODO: Handle more cost kinds.
904     if (CostKind != TTI::TCK_RecipThroughput)
905       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
906                                            Opd1Info, Opd2Info,
907                                            Args, CxtI);
908 
909     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
910 
911     bool IsFloat = Ty->isFPOrFPVectorTy();
912     // Assume that floating point arithmetic operations cost twice as much as
913     // integer operations.
914     InstructionCost OpCost = (IsFloat ? 2 : 1);
915 
916     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
917       // The operation is legal. Assume it costs 1.
918       // TODO: Once we have extract/insert subvector cost we need to use them.
919       return LT.first * OpCost;
920     }
921 
922     if (!TLI->isOperationExpand(ISD, LT.second)) {
923       // If the operation is custom lowered, then assume that the code is twice
924       // as expensive.
925       return LT.first * 2 * OpCost;
926     }
927 
928     // An 'Expand' of URem and SRem is special because it may default
929     // to expanding the operation into a sequence of sub-operations
930     // i.e. X % Y -> X-(X/Y)*Y.
931     if (ISD == ISD::UREM || ISD == ISD::SREM) {
932       bool IsSigned = ISD == ISD::SREM;
933       if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
934                                         LT.second) ||
935           TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
936                                         LT.second)) {
937         unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
938         InstructionCost DivCost = thisT()->getArithmeticInstrCost(
939             DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
940         InstructionCost MulCost =
941             thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
942         InstructionCost SubCost =
943             thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
944         return DivCost + MulCost + SubCost;
945       }
946     }
947 
948     // We cannot scalarize scalable vectors, so return Invalid.
949     if (isa<ScalableVectorType>(Ty))
950       return InstructionCost::getInvalid();
951 
952     // Else, assume that we need to scalarize this op.
953     // TODO: If one of the types get legalized by splitting, handle this
954     // similarly to what getCastInstrCost() does.
955     if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
956       InstructionCost Cost = thisT()->getArithmeticInstrCost(
957           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
958           Args, CxtI);
959       // Return the cost of multiple scalar invocation plus the cost of
960       // inserting and extracting the values.
961       SmallVector<Type *> Tys(Args.size(), Ty);
962       return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
963              VTy->getNumElements() * Cost;
964     }
965 
966     // We don't know anything about this scalar instruction.
967     return OpCost;
968   }
969 
improveShuffleKindFromMask(TTI::ShuffleKind Kind,ArrayRef<int> Mask,VectorType * Ty,int & Index,VectorType * & SubTy)970   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
971                                               ArrayRef<int> Mask,
972                                               VectorType *Ty, int &Index,
973                                               VectorType *&SubTy) const {
974     if (Mask.empty())
975       return Kind;
976     int NumSrcElts = Ty->getElementCount().getKnownMinValue();
977     switch (Kind) {
978     case TTI::SK_PermuteSingleSrc:
979       if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
980         return TTI::SK_Reverse;
981       if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
982         return TTI::SK_Broadcast;
983       if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
984           (Index + Mask.size()) <= (size_t)NumSrcElts) {
985         SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
986         return TTI::SK_ExtractSubvector;
987       }
988       break;
989     case TTI::SK_PermuteTwoSrc: {
990       int NumSubElts;
991       if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
992                                  Mask, NumSrcElts, NumSubElts, Index)) {
993         if (Index + NumSubElts > NumSrcElts)
994           return Kind;
995         SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
996         return TTI::SK_InsertSubvector;
997       }
998       if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
999         return TTI::SK_Select;
1000       if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1001         return TTI::SK_Transpose;
1002       if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1003         return TTI::SK_Splice;
1004       break;
1005     }
1006     case TTI::SK_Select:
1007     case TTI::SK_Reverse:
1008     case TTI::SK_Broadcast:
1009     case TTI::SK_Transpose:
1010     case TTI::SK_InsertSubvector:
1011     case TTI::SK_ExtractSubvector:
1012     case TTI::SK_Splice:
1013       break;
1014     }
1015     return Kind;
1016   }
1017 
1018   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
1019                                  ArrayRef<int> Mask,
1020                                  TTI::TargetCostKind CostKind, int Index,
1021                                  VectorType *SubTp,
1022                                  ArrayRef<const Value *> Args = std::nullopt,
1023                                  const Instruction *CxtI = nullptr) {
1024     switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1025     case TTI::SK_Broadcast:
1026       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1027         return getBroadcastShuffleOverhead(FVT, CostKind);
1028       return InstructionCost::getInvalid();
1029     case TTI::SK_Select:
1030     case TTI::SK_Splice:
1031     case TTI::SK_Reverse:
1032     case TTI::SK_Transpose:
1033     case TTI::SK_PermuteSingleSrc:
1034     case TTI::SK_PermuteTwoSrc:
1035       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1036         return getPermuteShuffleOverhead(FVT, CostKind);
1037       return InstructionCost::getInvalid();
1038     case TTI::SK_ExtractSubvector:
1039       return getExtractSubvectorOverhead(Tp, CostKind, Index,
1040                                          cast<FixedVectorType>(SubTp));
1041     case TTI::SK_InsertSubvector:
1042       return getInsertSubvectorOverhead(Tp, CostKind, Index,
1043                                         cast<FixedVectorType>(SubTp));
1044     }
1045     llvm_unreachable("Unknown TTI::ShuffleKind");
1046   }
1047 
1048   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1049                                    TTI::CastContextHint CCH,
1050                                    TTI::TargetCostKind CostKind,
1051                                    const Instruction *I = nullptr) {
1052     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1053       return 0;
1054 
1055     const TargetLoweringBase *TLI = getTLI();
1056     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1057     assert(ISD && "Invalid opcode");
1058     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1059     std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1060 
1061     TypeSize SrcSize = SrcLT.second.getSizeInBits();
1062     TypeSize DstSize = DstLT.second.getSizeInBits();
1063     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1064     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1065 
1066     switch (Opcode) {
1067     default:
1068       break;
1069     case Instruction::Trunc:
1070       // Check for NOOP conversions.
1071       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1072         return 0;
1073       [[fallthrough]];
1074     case Instruction::BitCast:
1075       // Bitcast between types that are legalized to the same type are free and
1076       // assume int to/from ptr of the same size is also free.
1077       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1078           SrcSize == DstSize)
1079         return 0;
1080       break;
1081     case Instruction::FPExt:
1082       if (I && getTLI()->isExtFree(I))
1083         return 0;
1084       break;
1085     case Instruction::ZExt:
1086       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1087         return 0;
1088       [[fallthrough]];
1089     case Instruction::SExt:
1090       if (I && getTLI()->isExtFree(I))
1091         return 0;
1092 
1093       // If this is a zext/sext of a load, return 0 if the corresponding
1094       // extending load exists on target and the result type is legal.
1095       if (CCH == TTI::CastContextHint::Normal) {
1096         EVT ExtVT = EVT::getEVT(Dst);
1097         EVT LoadVT = EVT::getEVT(Src);
1098         unsigned LType =
1099           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1100         if (DstLT.first == SrcLT.first &&
1101             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1102           return 0;
1103       }
1104       break;
1105     case Instruction::AddrSpaceCast:
1106       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1107                                    Dst->getPointerAddressSpace()))
1108         return 0;
1109       break;
1110     }
1111 
1112     auto *SrcVTy = dyn_cast<VectorType>(Src);
1113     auto *DstVTy = dyn_cast<VectorType>(Dst);
1114 
1115     // If the cast is marked as legal (or promote) then assume low cost.
1116     if (SrcLT.first == DstLT.first &&
1117         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1118       return SrcLT.first;
1119 
1120     // Handle scalar conversions.
1121     if (!SrcVTy && !DstVTy) {
1122       // Just check the op cost. If the operation is legal then assume it costs
1123       // 1.
1124       if (!TLI->isOperationExpand(ISD, DstLT.second))
1125         return 1;
1126 
1127       // Assume that illegal scalar instruction are expensive.
1128       return 4;
1129     }
1130 
1131     // Check vector-to-vector casts.
1132     if (DstVTy && SrcVTy) {
1133       // If the cast is between same-sized registers, then the check is simple.
1134       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1135 
1136         // Assume that Zext is done using AND.
1137         if (Opcode == Instruction::ZExt)
1138           return SrcLT.first;
1139 
1140         // Assume that sext is done using SHL and SRA.
1141         if (Opcode == Instruction::SExt)
1142           return SrcLT.first * 2;
1143 
1144         // Just check the op cost. If the operation is legal then assume it
1145         // costs
1146         // 1 and multiply by the type-legalization overhead.
1147         if (!TLI->isOperationExpand(ISD, DstLT.second))
1148           return SrcLT.first * 1;
1149       }
1150 
1151       // If we are legalizing by splitting, query the concrete TTI for the cost
1152       // of casting the original vector twice. We also need to factor in the
1153       // cost of the split itself. Count that as 1, to be consistent with
1154       // getTypeLegalizationCost().
1155       bool SplitSrc =
1156           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1157           TargetLowering::TypeSplitVector;
1158       bool SplitDst =
1159           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1160           TargetLowering::TypeSplitVector;
1161       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1162           DstVTy->getElementCount().isVector()) {
1163         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1164         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1165         T *TTI = static_cast<T *>(this);
1166         // If both types need to be split then the split is free.
1167         InstructionCost SplitCost =
1168             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1169         return SplitCost +
1170                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1171                                           CostKind, I));
1172       }
1173 
1174       // Scalarization cost is Invalid, can't assume any num elements.
1175       if (isa<ScalableVectorType>(DstVTy))
1176         return InstructionCost::getInvalid();
1177 
1178       // In other cases where the source or destination are illegal, assume
1179       // the operation will get scalarized.
1180       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1181       InstructionCost Cost = thisT()->getCastInstrCost(
1182           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1183 
1184       // Return the cost of multiple scalar invocation plus the cost of
1185       // inserting and extracting the values.
1186       return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1187                                       CostKind) +
1188              Num * Cost;
1189     }
1190 
1191     // We already handled vector-to-vector and scalar-to-scalar conversions.
1192     // This
1193     // is where we handle bitcast between vectors and scalars. We need to assume
1194     //  that the conversion is scalarized in one way or another.
1195     if (Opcode == Instruction::BitCast) {
1196       // Illegal bitcasts are done by storing and loading from a stack slot.
1197       return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1198                                                 /*Extract*/ true, CostKind)
1199                      : 0) +
1200              (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1201                                                 /*Extract*/ false, CostKind)
1202                      : 0);
1203     }
1204 
1205     llvm_unreachable("Unhandled cast");
1206   }
1207 
getExtractWithExtendCost(unsigned Opcode,Type * Dst,VectorType * VecTy,unsigned Index)1208   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1209                                            VectorType *VecTy, unsigned Index) {
1210     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1211     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1212                                        CostKind, Index, nullptr, nullptr) +
1213            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1214                                      TTI::CastContextHint::None, CostKind);
1215   }
1216 
1217   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1218                                  const Instruction *I = nullptr) {
1219     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1220   }
1221 
1222   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1223                                      CmpInst::Predicate VecPred,
1224                                      TTI::TargetCostKind CostKind,
1225                                      const Instruction *I = nullptr) {
1226     const TargetLoweringBase *TLI = getTLI();
1227     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1228     assert(ISD && "Invalid opcode");
1229 
1230     // TODO: Handle other cost kinds.
1231     if (CostKind != TTI::TCK_RecipThroughput)
1232       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1233                                        I);
1234 
1235     // Selects on vectors are actually vector selects.
1236     if (ISD == ISD::SELECT) {
1237       assert(CondTy && "CondTy must exist");
1238       if (CondTy->isVectorTy())
1239         ISD = ISD::VSELECT;
1240     }
1241     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1242 
1243     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1244         !TLI->isOperationExpand(ISD, LT.second)) {
1245       // The operation is legal. Assume it costs 1. Multiply
1246       // by the type-legalization overhead.
1247       return LT.first * 1;
1248     }
1249 
1250     // Otherwise, assume that the cast is scalarized.
1251     // TODO: If one of the types get legalized by splitting, handle this
1252     // similarly to what getCastInstrCost() does.
1253     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1254       if (isa<ScalableVectorType>(ValTy))
1255         return InstructionCost::getInvalid();
1256 
1257       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1258       if (CondTy)
1259         CondTy = CondTy->getScalarType();
1260       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1261           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1262 
1263       // Return the cost of multiple scalar invocation plus the cost of
1264       // inserting and extracting the values.
1265       return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1266                                       /*Extract*/ false, CostKind) +
1267              Num * Cost;
1268     }
1269 
1270     // Unknown scalar opcode.
1271     return 1;
1272   }
1273 
getVectorInstrCost(unsigned Opcode,Type * Val,TTI::TargetCostKind CostKind,unsigned Index,Value * Op0,Value * Op1)1274   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1275                                      TTI::TargetCostKind CostKind,
1276                                      unsigned Index, Value *Op0, Value *Op1) {
1277     return getRegUsageForType(Val->getScalarType());
1278   }
1279 
getVectorInstrCost(const Instruction & I,Type * Val,TTI::TargetCostKind CostKind,unsigned Index)1280   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1281                                      TTI::TargetCostKind CostKind,
1282                                      unsigned Index) {
1283     Value *Op0 = nullptr;
1284     Value *Op1 = nullptr;
1285     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1286       Op0 = IE->getOperand(0);
1287       Op1 = IE->getOperand(1);
1288     }
1289     return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1290                                        Op1);
1291   }
1292 
getReplicationShuffleCost(Type * EltTy,int ReplicationFactor,int VF,const APInt & DemandedDstElts,TTI::TargetCostKind CostKind)1293   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1294                                             int VF,
1295                                             const APInt &DemandedDstElts,
1296                                             TTI::TargetCostKind CostKind) {
1297     assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1298            "Unexpected size of DemandedDstElts.");
1299 
1300     InstructionCost Cost;
1301 
1302     auto *SrcVT = FixedVectorType::get(EltTy, VF);
1303     auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1304 
1305     // The Mask shuffling cost is extract all the elements of the Mask
1306     // and insert each of them Factor times into the wide vector:
1307     //
1308     // E.g. an interleaved group with factor 3:
1309     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1310     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1311     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1312     // The cost is estimated as extract all mask elements from the <8xi1> mask
1313     // vector and insert them factor times into the <24xi1> shuffled mask
1314     // vector.
1315     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1316     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1317                                               /*Insert*/ false,
1318                                               /*Extract*/ true, CostKind);
1319     Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1320                                               /*Insert*/ true,
1321                                               /*Extract*/ false, CostKind);
1322 
1323     return Cost;
1324   }
1325 
1326   InstructionCost
1327   getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1328                   unsigned AddressSpace, TTI::TargetCostKind CostKind,
1329                   TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
1330                   const Instruction *I = nullptr) {
1331     assert(!Src->isVoidTy() && "Invalid type");
1332     // Assume types, such as structs, are expensive.
1333     if (getTLI()->getValueType(DL, Src,  true) == MVT::Other)
1334       return 4;
1335     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1336 
1337     // Assuming that all loads of legal types cost 1.
1338     InstructionCost Cost = LT.first;
1339     if (CostKind != TTI::TCK_RecipThroughput)
1340       return Cost;
1341 
1342     const DataLayout &DL = this->getDataLayout();
1343     if (Src->isVectorTy() &&
1344         // In practice it's not currently possible to have a change in lane
1345         // length for extending loads or truncating stores so both types should
1346         // have the same scalable property.
1347         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1348                             LT.second.getSizeInBits())) {
1349       // This is a vector load that legalizes to a larger type than the vector
1350       // itself. Unless the corresponding extending load or truncating store is
1351       // legal, then this will scalarize.
1352       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1353       EVT MemVT = getTLI()->getValueType(DL, Src);
1354       if (Opcode == Instruction::Store)
1355         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1356       else
1357         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1358 
1359       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1360         // This is a vector load/store for some illegal type that is scalarized.
1361         // We must account for the cost of building or decomposing the vector.
1362         Cost += getScalarizationOverhead(
1363             cast<VectorType>(Src), Opcode != Instruction::Store,
1364             Opcode == Instruction::Store, CostKind);
1365       }
1366     }
1367 
1368     return Cost;
1369   }
1370 
getMaskedMemoryOpCost(unsigned Opcode,Type * DataTy,Align Alignment,unsigned AddressSpace,TTI::TargetCostKind CostKind)1371   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1372                                         Align Alignment, unsigned AddressSpace,
1373                                         TTI::TargetCostKind CostKind) {
1374     // TODO: Pass on AddressSpace when we have test coverage.
1375     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1376                                        CostKind);
1377   }
1378 
1379   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1380                                          const Value *Ptr, bool VariableMask,
1381                                          Align Alignment,
1382                                          TTI::TargetCostKind CostKind,
1383                                          const Instruction *I = nullptr) {
1384     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1385                                        true, CostKind);
1386   }
1387 
getStridedMemoryOpCost(unsigned Opcode,Type * DataTy,const Value * Ptr,bool VariableMask,Align Alignment,TTI::TargetCostKind CostKind,const Instruction * I)1388   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
1389                                          const Value *Ptr, bool VariableMask,
1390                                          Align Alignment,
1391                                          TTI::TargetCostKind CostKind,
1392                                          const Instruction *I) {
1393     // For a target without strided memory operations (or for an illegal
1394     // operation type on one which does), assume we lower to a gather/scatter
1395     // operation.  (Which may in turn be scalarized.)
1396     return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1397                                            Alignment, CostKind, I);
1398   }
1399 
1400   InstructionCost getInterleavedMemoryOpCost(
1401       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1402       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1403       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1404 
1405     // We cannot scalarize scalable vectors, so return Invalid.
1406     if (isa<ScalableVectorType>(VecTy))
1407       return InstructionCost::getInvalid();
1408 
1409     auto *VT = cast<FixedVectorType>(VecTy);
1410 
1411     unsigned NumElts = VT->getNumElements();
1412     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1413 
1414     unsigned NumSubElts = NumElts / Factor;
1415     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1416 
1417     // Firstly, the cost of load/store operation.
1418     InstructionCost Cost;
1419     if (UseMaskForCond || UseMaskForGaps)
1420       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1421                                             AddressSpace, CostKind);
1422     else
1423       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1424                                       CostKind);
1425 
1426     // Legalize the vector type, and get the legalized and unlegalized type
1427     // sizes.
1428     MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1429     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1430     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1431 
1432     // Scale the cost of the memory operation by the fraction of legalized
1433     // instructions that will actually be used. We shouldn't account for the
1434     // cost of dead instructions since they will be removed.
1435     //
1436     // E.g., An interleaved load of factor 8:
1437     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1438     //       %v0 = shufflevector %vec, undef, <0, 8>
1439     //
1440     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1441     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1442     // type). The other loads are unused.
1443     //
1444     // TODO: Note that legalization can turn masked loads/stores into unmasked
1445     // (legalized) loads/stores. This can be reflected in the cost.
1446     if (Cost.isValid() && VecTySize > VecTyLTSize) {
1447       // The number of loads of a legal type it will take to represent a load
1448       // of the unlegalized vector type.
1449       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1450 
1451       // The number of elements of the unlegalized type that correspond to a
1452       // single legal instruction.
1453       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1454 
1455       // Determine which legal instructions will be used.
1456       BitVector UsedInsts(NumLegalInsts, false);
1457       for (unsigned Index : Indices)
1458         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1459           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1460 
1461       // Scale the cost of the load by the fraction of legal instructions that
1462       // will be used.
1463       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1464     }
1465 
1466     // Then plus the cost of interleave operation.
1467     assert(Indices.size() <= Factor &&
1468            "Interleaved memory op has too many members");
1469 
1470     const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1471     const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1472 
1473     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1474     for (unsigned Index : Indices) {
1475       assert(Index < Factor && "Invalid index for interleaved memory op");
1476       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1477         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1478     }
1479 
1480     if (Opcode == Instruction::Load) {
1481       // The interleave cost is similar to extract sub vectors' elements
1482       // from the wide vector, and insert them into sub vectors.
1483       //
1484       // E.g. An interleaved load of factor 2 (with one member of index 0):
1485       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1486       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1487       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1488       // <8 x i32> vector and insert them into a <4 x i32> vector.
1489       InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1490           SubVT, DemandedAllSubElts,
1491           /*Insert*/ true, /*Extract*/ false, CostKind);
1492       Cost += Indices.size() * InsSubCost;
1493       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1494                                                 /*Insert*/ false,
1495                                                 /*Extract*/ true, CostKind);
1496     } else {
1497       // The interleave cost is extract elements from sub vectors, and
1498       // insert them into the wide vector.
1499       //
1500       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1501       // (using VF=4):
1502       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1503       //    %gaps.mask = <true, true, false, true, true, false,
1504       //                  true, true, false, true, true, false>
1505       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1506       //                           i32 Align, <12 x i1> %gaps.mask
1507       // The cost is estimated as extract all elements (of actual members,
1508       // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1509       // i32> vector.
1510       InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1511           SubVT, DemandedAllSubElts,
1512           /*Insert*/ false, /*Extract*/ true, CostKind);
1513       Cost += ExtSubCost * Indices.size();
1514       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1515                                                 /*Insert*/ true,
1516                                                 /*Extract*/ false, CostKind);
1517     }
1518 
1519     if (!UseMaskForCond)
1520       return Cost;
1521 
1522     Type *I8Type = Type::getInt8Ty(VT->getContext());
1523 
1524     Cost += thisT()->getReplicationShuffleCost(
1525         I8Type, Factor, NumSubElts,
1526         UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1527         CostKind);
1528 
1529     // The Gaps mask is invariant and created outside the loop, therefore the
1530     // cost of creating it is not accounted for here. However if we have both
1531     // a MaskForGaps and some other mask that guards the execution of the
1532     // memory access, we need to account for the cost of And-ing the two masks
1533     // inside the loop.
1534     if (UseMaskForGaps) {
1535       auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1536       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1537                                               CostKind);
1538     }
1539 
1540     return Cost;
1541   }
1542 
1543   /// Get intrinsic cost based on arguments.
getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1544   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1545                                         TTI::TargetCostKind CostKind) {
1546     // Check for generically free intrinsics.
1547     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1548       return 0;
1549 
1550     // Assume that target intrinsics are cheap.
1551     Intrinsic::ID IID = ICA.getID();
1552     if (Function::isTargetIntrinsic(IID))
1553       return TargetTransformInfo::TCC_Basic;
1554 
1555     if (ICA.isTypeBasedOnly())
1556       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1557 
1558     Type *RetTy = ICA.getReturnType();
1559 
1560     ElementCount RetVF =
1561         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1562                              : ElementCount::getFixed(1));
1563     const IntrinsicInst *I = ICA.getInst();
1564     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1565     FastMathFlags FMF = ICA.getFlags();
1566     switch (IID) {
1567     default:
1568       break;
1569 
1570     case Intrinsic::powi:
1571       if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1572         bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1573         if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1574                                                ShouldOptForSize)) {
1575           // The cost is modeled on the expansion performed by ExpandPowI in
1576           // SelectionDAGBuilder.
1577           APInt Exponent = RHSC->getValue().abs();
1578           unsigned ActiveBits = Exponent.getActiveBits();
1579           unsigned PopCount = Exponent.popcount();
1580           InstructionCost Cost = (ActiveBits + PopCount - 2) *
1581                                  thisT()->getArithmeticInstrCost(
1582                                      Instruction::FMul, RetTy, CostKind);
1583           if (RHSC->isNegative())
1584             Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1585                                                     CostKind);
1586           return Cost;
1587         }
1588       }
1589       break;
1590     case Intrinsic::cttz:
1591       // FIXME: If necessary, this should go in target-specific overrides.
1592       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1593         return TargetTransformInfo::TCC_Basic;
1594       break;
1595 
1596     case Intrinsic::ctlz:
1597       // FIXME: If necessary, this should go in target-specific overrides.
1598       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1599         return TargetTransformInfo::TCC_Basic;
1600       break;
1601 
1602     case Intrinsic::memcpy:
1603       return thisT()->getMemcpyCost(ICA.getInst());
1604 
1605     case Intrinsic::masked_scatter: {
1606       const Value *Mask = Args[3];
1607       bool VarMask = !isa<Constant>(Mask);
1608       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1609       return thisT()->getGatherScatterOpCost(Instruction::Store,
1610                                              ICA.getArgTypes()[0], Args[1],
1611                                              VarMask, Alignment, CostKind, I);
1612     }
1613     case Intrinsic::masked_gather: {
1614       const Value *Mask = Args[2];
1615       bool VarMask = !isa<Constant>(Mask);
1616       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1617       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1618                                              VarMask, Alignment, CostKind, I);
1619     }
1620     case Intrinsic::experimental_vp_strided_store: {
1621       const Value *Data = Args[0];
1622       const Value *Ptr = Args[1];
1623       const Value *Mask = Args[3];
1624       const Value *EVL = Args[4];
1625       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1626       Align Alignment = I->getParamAlign(1).valueOrOne();
1627       return thisT()->getStridedMemoryOpCost(Instruction::Store,
1628                                              Data->getType(), Ptr, VarMask,
1629                                              Alignment, CostKind, I);
1630     }
1631     case Intrinsic::experimental_vp_strided_load: {
1632       const Value *Ptr = Args[0];
1633       const Value *Mask = Args[2];
1634       const Value *EVL = Args[3];
1635       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1636       Align Alignment = I->getParamAlign(0).valueOrOne();
1637       return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1638                                              VarMask, Alignment, CostKind, I);
1639     }
1640     case Intrinsic::experimental_stepvector: {
1641       if (isa<ScalableVectorType>(RetTy))
1642         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1643       // The cost of materialising a constant integer vector.
1644       return TargetTransformInfo::TCC_Basic;
1645     }
1646     case Intrinsic::vector_extract: {
1647       // FIXME: Handle case where a scalable vector is extracted from a scalable
1648       // vector
1649       if (isa<ScalableVectorType>(RetTy))
1650         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1651       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1652       return thisT()->getShuffleCost(
1653           TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1654           std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1655     }
1656     case Intrinsic::vector_insert: {
1657       // FIXME: Handle case where a scalable vector is inserted into a scalable
1658       // vector
1659       if (isa<ScalableVectorType>(Args[1]->getType()))
1660         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1661       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1662       return thisT()->getShuffleCost(
1663           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1664           std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1665     }
1666     case Intrinsic::vector_reverse: {
1667       return thisT()->getShuffleCost(
1668           TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1669           CostKind, 0, cast<VectorType>(RetTy));
1670     }
1671     case Intrinsic::vector_splice: {
1672       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1673       return thisT()->getShuffleCost(
1674           TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1675           CostKind, Index, cast<VectorType>(RetTy));
1676     }
1677     case Intrinsic::vector_reduce_add:
1678     case Intrinsic::vector_reduce_mul:
1679     case Intrinsic::vector_reduce_and:
1680     case Intrinsic::vector_reduce_or:
1681     case Intrinsic::vector_reduce_xor:
1682     case Intrinsic::vector_reduce_smax:
1683     case Intrinsic::vector_reduce_smin:
1684     case Intrinsic::vector_reduce_fmax:
1685     case Intrinsic::vector_reduce_fmin:
1686     case Intrinsic::vector_reduce_fmaximum:
1687     case Intrinsic::vector_reduce_fminimum:
1688     case Intrinsic::vector_reduce_umax:
1689     case Intrinsic::vector_reduce_umin: {
1690       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1691       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1692     }
1693     case Intrinsic::vector_reduce_fadd:
1694     case Intrinsic::vector_reduce_fmul: {
1695       IntrinsicCostAttributes Attrs(
1696           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1697       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1698     }
1699     case Intrinsic::fshl:
1700     case Intrinsic::fshr: {
1701       const Value *X = Args[0];
1702       const Value *Y = Args[1];
1703       const Value *Z = Args[2];
1704       const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1705       const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1706       const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1707       const TTI::OperandValueInfo OpInfoBW =
1708         {TTI::OK_UniformConstantValue,
1709          isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1710          : TTI::OP_None};
1711 
1712       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1713       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1714       InstructionCost Cost = 0;
1715       Cost +=
1716           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1717       Cost +=
1718           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1719       Cost += thisT()->getArithmeticInstrCost(
1720           BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1721           {OpInfoZ.Kind, TTI::OP_None});
1722       Cost += thisT()->getArithmeticInstrCost(
1723           BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1724           {OpInfoZ.Kind, TTI::OP_None});
1725       // Non-constant shift amounts requires a modulo.
1726       if (!OpInfoZ.isConstant())
1727         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1728                                                 CostKind, OpInfoZ, OpInfoBW);
1729       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1730       if (X != Y) {
1731         Type *CondTy = RetTy->getWithNewBitWidth(1);
1732         Cost +=
1733             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1734                                         CmpInst::ICMP_EQ, CostKind);
1735         Cost +=
1736             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1737                                         CmpInst::ICMP_EQ, CostKind);
1738       }
1739       return Cost;
1740     }
1741     case Intrinsic::get_active_lane_mask: {
1742       EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1743       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1744 
1745       // If we're not expanding the intrinsic then we assume this is cheap
1746       // to implement.
1747       if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1748         return getTypeLegalizationCost(RetTy).first;
1749       }
1750 
1751       // Create the expanded types that will be used to calculate the uadd_sat
1752       // operation.
1753       Type *ExpRetTy = VectorType::get(
1754           ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1755       IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1756       InstructionCost Cost =
1757           thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1758       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1759                                           CmpInst::ICMP_ULT, CostKind);
1760       return Cost;
1761     }
1762     case Intrinsic::experimental_cttz_elts: {
1763       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1764 
1765       // If we're not expanding the intrinsic then we assume this is cheap
1766       // to implement.
1767       if (!getTLI()->shouldExpandCttzElements(ArgType))
1768         return getTypeLegalizationCost(RetTy).first;
1769 
1770       // TODO: The costs below reflect the expansion code in
1771       // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1772       // favour of compile time.
1773 
1774       // Find the smallest "sensible" element type to use for the expansion.
1775       bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1776       ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1777       if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1778         VScaleRange = getVScaleRange(I->getCaller(), 64);
1779 
1780       unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1781           RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1782       Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1783 
1784       // Create the new vector type & get the vector length
1785       Type *NewVecTy = VectorType::get(
1786           NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1787 
1788       IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
1789                                            NewVecTy, {}, FMF);
1790       InstructionCost Cost =
1791           thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1792 
1793       Cost +=
1794           thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1795       Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1796                                         Args[0]->getType(),
1797                                         TTI::CastContextHint::None, CostKind);
1798       Cost +=
1799           thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1800 
1801       IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1802                                          NewEltTy, NewVecTy, FMF, I, 1);
1803       Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1804       Cost +=
1805           thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1806 
1807       return Cost;
1808     }
1809     }
1810 
1811     // VP Intrinsics should have the same cost as their non-vp counterpart.
1812     // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1813     // counterpart when the vector length argument is smaller than the maximum
1814     // vector length.
1815     // TODO: Support other kinds of VPIntrinsics
1816     if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1817       std::optional<unsigned> FOp =
1818           VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1819       if (FOp) {
1820         if (ICA.getID() == Intrinsic::vp_load) {
1821           Align Alignment;
1822           if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1823             Alignment = VPI->getPointerAlignment().valueOrOne();
1824           unsigned AS = 0;
1825           if (ICA.getArgs().size() > 1)
1826             if (auto *PtrTy =
1827                     dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1828               AS = PtrTy->getAddressSpace();
1829           return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1830                                           AS, CostKind);
1831         }
1832         if (ICA.getID() == Intrinsic::vp_store) {
1833           Align Alignment;
1834           if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1835             Alignment = VPI->getPointerAlignment().valueOrOne();
1836           unsigned AS = 0;
1837           if (ICA.getArgs().size() >= 2)
1838             if (auto *PtrTy =
1839                     dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1840               AS = PtrTy->getAddressSpace();
1841           return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1842                                           AS, CostKind);
1843         }
1844         if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
1845           return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1846                                                  CostKind);
1847         }
1848       }
1849 
1850       std::optional<Intrinsic::ID> FID =
1851           VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1852       if (FID) {
1853         // Non-vp version will have same Args/Tys except mask and vector length.
1854         assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1855                "Expected VPIntrinsic to have Mask and Vector Length args and "
1856                "types");
1857         ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
1858 
1859         // VPReduction intrinsics have a start value argument that their non-vp
1860         // counterparts do not have, except for the fadd and fmul non-vp
1861         // counterpart.
1862         if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
1863             *FID != Intrinsic::vector_reduce_fadd &&
1864             *FID != Intrinsic::vector_reduce_fmul)
1865           NewTys = NewTys.drop_front();
1866 
1867         IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1868                                        ICA.getFlags());
1869         return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1870       }
1871     }
1872 
1873     // Assume that we need to scalarize this intrinsic.)
1874     // Compute the scalarization overhead based on Args for a vector
1875     // intrinsic.
1876     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1877     if (RetVF.isVector() && !RetVF.isScalable()) {
1878       ScalarizationCost = 0;
1879       if (!RetTy->isVoidTy())
1880         ScalarizationCost += getScalarizationOverhead(
1881             cast<VectorType>(RetTy),
1882             /*Insert*/ true, /*Extract*/ false, CostKind);
1883       ScalarizationCost +=
1884           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1885     }
1886 
1887     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1888                                   ScalarizationCost);
1889     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1890   }
1891 
1892   /// Get intrinsic cost based on argument types.
1893   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1894   /// cost of scalarizing the arguments and the return value will be computed
1895   /// based on types.
1896   InstructionCost
getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1897   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1898                                  TTI::TargetCostKind CostKind) {
1899     Intrinsic::ID IID = ICA.getID();
1900     Type *RetTy = ICA.getReturnType();
1901     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1902     FastMathFlags FMF = ICA.getFlags();
1903     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1904     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1905 
1906     VectorType *VecOpTy = nullptr;
1907     if (!Tys.empty()) {
1908       // The vector reduction operand is operand 0 except for fadd/fmul.
1909       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1910       unsigned VecTyIndex = 0;
1911       if (IID == Intrinsic::vector_reduce_fadd ||
1912           IID == Intrinsic::vector_reduce_fmul)
1913         VecTyIndex = 1;
1914       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1915       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1916     }
1917 
1918     // Library call cost - other than size, make it expensive.
1919     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1920     unsigned ISD = 0;
1921     switch (IID) {
1922     default: {
1923       // Scalable vectors cannot be scalarized, so return Invalid.
1924       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1925             return isa<ScalableVectorType>(Ty);
1926           }))
1927         return InstructionCost::getInvalid();
1928 
1929       // Assume that we need to scalarize this intrinsic.
1930       InstructionCost ScalarizationCost =
1931           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1932       unsigned ScalarCalls = 1;
1933       Type *ScalarRetTy = RetTy;
1934       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1935         if (!SkipScalarizationCost)
1936           ScalarizationCost = getScalarizationOverhead(
1937               RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1938         ScalarCalls = std::max(ScalarCalls,
1939                                cast<FixedVectorType>(RetVTy)->getNumElements());
1940         ScalarRetTy = RetTy->getScalarType();
1941       }
1942       SmallVector<Type *, 4> ScalarTys;
1943       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1944         Type *Ty = Tys[i];
1945         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1946           if (!SkipScalarizationCost)
1947             ScalarizationCost += getScalarizationOverhead(
1948                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1949           ScalarCalls = std::max(ScalarCalls,
1950                                  cast<FixedVectorType>(VTy)->getNumElements());
1951           Ty = Ty->getScalarType();
1952         }
1953         ScalarTys.push_back(Ty);
1954       }
1955       if (ScalarCalls == 1)
1956         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1957 
1958       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1959       InstructionCost ScalarCost =
1960           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1961 
1962       return ScalarCalls * ScalarCost + ScalarizationCost;
1963     }
1964     // Look for intrinsics that can be lowered directly or turned into a scalar
1965     // intrinsic call.
1966     case Intrinsic::sqrt:
1967       ISD = ISD::FSQRT;
1968       break;
1969     case Intrinsic::sin:
1970       ISD = ISD::FSIN;
1971       break;
1972     case Intrinsic::cos:
1973       ISD = ISD::FCOS;
1974       break;
1975     case Intrinsic::exp:
1976       ISD = ISD::FEXP;
1977       break;
1978     case Intrinsic::exp2:
1979       ISD = ISD::FEXP2;
1980       break;
1981     case Intrinsic::exp10:
1982       ISD = ISD::FEXP10;
1983       break;
1984     case Intrinsic::log:
1985       ISD = ISD::FLOG;
1986       break;
1987     case Intrinsic::log10:
1988       ISD = ISD::FLOG10;
1989       break;
1990     case Intrinsic::log2:
1991       ISD = ISD::FLOG2;
1992       break;
1993     case Intrinsic::fabs:
1994       ISD = ISD::FABS;
1995       break;
1996     case Intrinsic::canonicalize:
1997       ISD = ISD::FCANONICALIZE;
1998       break;
1999     case Intrinsic::minnum:
2000       ISD = ISD::FMINNUM;
2001       break;
2002     case Intrinsic::maxnum:
2003       ISD = ISD::FMAXNUM;
2004       break;
2005     case Intrinsic::minimum:
2006       ISD = ISD::FMINIMUM;
2007       break;
2008     case Intrinsic::maximum:
2009       ISD = ISD::FMAXIMUM;
2010       break;
2011     case Intrinsic::copysign:
2012       ISD = ISD::FCOPYSIGN;
2013       break;
2014     case Intrinsic::floor:
2015       ISD = ISD::FFLOOR;
2016       break;
2017     case Intrinsic::ceil:
2018       ISD = ISD::FCEIL;
2019       break;
2020     case Intrinsic::trunc:
2021       ISD = ISD::FTRUNC;
2022       break;
2023     case Intrinsic::nearbyint:
2024       ISD = ISD::FNEARBYINT;
2025       break;
2026     case Intrinsic::rint:
2027       ISD = ISD::FRINT;
2028       break;
2029     case Intrinsic::lrint:
2030       ISD = ISD::LRINT;
2031       break;
2032     case Intrinsic::llrint:
2033       ISD = ISD::LLRINT;
2034       break;
2035     case Intrinsic::round:
2036       ISD = ISD::FROUND;
2037       break;
2038     case Intrinsic::roundeven:
2039       ISD = ISD::FROUNDEVEN;
2040       break;
2041     case Intrinsic::pow:
2042       ISD = ISD::FPOW;
2043       break;
2044     case Intrinsic::fma:
2045       ISD = ISD::FMA;
2046       break;
2047     case Intrinsic::fmuladd:
2048       ISD = ISD::FMA;
2049       break;
2050     case Intrinsic::experimental_constrained_fmuladd:
2051       ISD = ISD::STRICT_FMA;
2052       break;
2053     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2054     case Intrinsic::lifetime_start:
2055     case Intrinsic::lifetime_end:
2056     case Intrinsic::sideeffect:
2057     case Intrinsic::pseudoprobe:
2058     case Intrinsic::arithmetic_fence:
2059       return 0;
2060     case Intrinsic::masked_store: {
2061       Type *Ty = Tys[0];
2062       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2063       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2064                                             CostKind);
2065     }
2066     case Intrinsic::masked_load: {
2067       Type *Ty = RetTy;
2068       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2069       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2070                                             CostKind);
2071     }
2072     case Intrinsic::vector_reduce_add:
2073     case Intrinsic::vector_reduce_mul:
2074     case Intrinsic::vector_reduce_and:
2075     case Intrinsic::vector_reduce_or:
2076     case Intrinsic::vector_reduce_xor:
2077       return thisT()->getArithmeticReductionCost(
2078           getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2079           CostKind);
2080     case Intrinsic::vector_reduce_fadd:
2081     case Intrinsic::vector_reduce_fmul:
2082       return thisT()->getArithmeticReductionCost(
2083           getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2084     case Intrinsic::vector_reduce_smax:
2085     case Intrinsic::vector_reduce_smin:
2086     case Intrinsic::vector_reduce_umax:
2087     case Intrinsic::vector_reduce_umin:
2088     case Intrinsic::vector_reduce_fmax:
2089     case Intrinsic::vector_reduce_fmin:
2090     case Intrinsic::vector_reduce_fmaximum:
2091     case Intrinsic::vector_reduce_fminimum:
2092       return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2093                                              VecOpTy, ICA.getFlags(), CostKind);
2094     case Intrinsic::abs: {
2095       // abs(X) = select(icmp(X,0),X,sub(0,X))
2096       Type *CondTy = RetTy->getWithNewBitWidth(1);
2097       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2098       InstructionCost Cost = 0;
2099       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2100                                           Pred, CostKind);
2101       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2102                                           Pred, CostKind);
2103       // TODO: Should we add an OperandValueProperties::OP_Zero property?
2104       Cost += thisT()->getArithmeticInstrCost(
2105          BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
2106       return Cost;
2107     }
2108     case Intrinsic::smax:
2109     case Intrinsic::smin:
2110     case Intrinsic::umax:
2111     case Intrinsic::umin: {
2112       // minmax(X,Y) = select(icmp(X,Y),X,Y)
2113       Type *CondTy = RetTy->getWithNewBitWidth(1);
2114       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2115       CmpInst::Predicate Pred =
2116           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2117       InstructionCost Cost = 0;
2118       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2119                                           Pred, CostKind);
2120       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2121                                           Pred, CostKind);
2122       return Cost;
2123     }
2124     case Intrinsic::sadd_sat:
2125     case Intrinsic::ssub_sat: {
2126       Type *CondTy = RetTy->getWithNewBitWidth(1);
2127 
2128       Type *OpTy = StructType::create({RetTy, CondTy});
2129       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2130                                      ? Intrinsic::sadd_with_overflow
2131                                      : Intrinsic::ssub_with_overflow;
2132       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2133 
2134       // SatMax -> Overflow && SumDiff < 0
2135       // SatMin -> Overflow && SumDiff >= 0
2136       InstructionCost Cost = 0;
2137       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2138                                     nullptr, ScalarizationCostPassed);
2139       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2140       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2141                                           Pred, CostKind);
2142       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2143                                               CondTy, Pred, CostKind);
2144       return Cost;
2145     }
2146     case Intrinsic::uadd_sat:
2147     case Intrinsic::usub_sat: {
2148       Type *CondTy = RetTy->getWithNewBitWidth(1);
2149 
2150       Type *OpTy = StructType::create({RetTy, CondTy});
2151       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2152                                      ? Intrinsic::uadd_with_overflow
2153                                      : Intrinsic::usub_with_overflow;
2154 
2155       InstructionCost Cost = 0;
2156       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2157                                     nullptr, ScalarizationCostPassed);
2158       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2159       Cost +=
2160           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2161                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2162       return Cost;
2163     }
2164     case Intrinsic::smul_fix:
2165     case Intrinsic::umul_fix: {
2166       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2167       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2168 
2169       unsigned ExtOp =
2170           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2171       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2172 
2173       InstructionCost Cost = 0;
2174       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2175       Cost +=
2176           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2177       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2178                                             CCH, CostKind);
2179       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2180                                               CostKind,
2181                                               {TTI::OK_AnyValue, TTI::OP_None},
2182                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2183       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2184                                               {TTI::OK_AnyValue, TTI::OP_None},
2185                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2186       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2187       return Cost;
2188     }
2189     case Intrinsic::sadd_with_overflow:
2190     case Intrinsic::ssub_with_overflow: {
2191       Type *SumTy = RetTy->getContainedType(0);
2192       Type *OverflowTy = RetTy->getContainedType(1);
2193       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2194                             ? BinaryOperator::Add
2195                             : BinaryOperator::Sub;
2196 
2197       //   Add:
2198       //   Overflow -> (Result < LHS) ^ (RHS < 0)
2199       //   Sub:
2200       //   Overflow -> (Result < LHS) ^ (RHS > 0)
2201       InstructionCost Cost = 0;
2202       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2203       Cost += 2 * thisT()->getCmpSelInstrCost(
2204                       Instruction::ICmp, SumTy, OverflowTy,
2205                       CmpInst::ICMP_SGT, CostKind);
2206       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2207                                               CostKind);
2208       return Cost;
2209     }
2210     case Intrinsic::uadd_with_overflow:
2211     case Intrinsic::usub_with_overflow: {
2212       Type *SumTy = RetTy->getContainedType(0);
2213       Type *OverflowTy = RetTy->getContainedType(1);
2214       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2215                             ? BinaryOperator::Add
2216                             : BinaryOperator::Sub;
2217       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2218                                     ? CmpInst::ICMP_ULT
2219                                     : CmpInst::ICMP_UGT;
2220 
2221       InstructionCost Cost = 0;
2222       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2223       Cost +=
2224           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2225                                       Pred, CostKind);
2226       return Cost;
2227     }
2228     case Intrinsic::smul_with_overflow:
2229     case Intrinsic::umul_with_overflow: {
2230       Type *MulTy = RetTy->getContainedType(0);
2231       Type *OverflowTy = RetTy->getContainedType(1);
2232       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2233       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2234       bool IsSigned = IID == Intrinsic::smul_with_overflow;
2235 
2236       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2237       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2238 
2239       InstructionCost Cost = 0;
2240       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2241       Cost +=
2242           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2243       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2244                                             CCH, CostKind);
2245       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2246                                               CostKind,
2247                                               {TTI::OK_AnyValue, TTI::OP_None},
2248                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2249 
2250       if (IsSigned)
2251         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2252                                                 CostKind,
2253                                                 {TTI::OK_AnyValue, TTI::OP_None},
2254                                                 {TTI::OK_UniformConstantValue, TTI::OP_None});
2255 
2256       Cost += thisT()->getCmpSelInstrCost(
2257           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2258       return Cost;
2259     }
2260     case Intrinsic::fptosi_sat:
2261     case Intrinsic::fptoui_sat: {
2262       if (Tys.empty())
2263         break;
2264       Type *FromTy = Tys[0];
2265       bool IsSigned = IID == Intrinsic::fptosi_sat;
2266 
2267       InstructionCost Cost = 0;
2268       IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2269                                      {FromTy, FromTy});
2270       Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2271       IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2272                                      {FromTy, FromTy});
2273       Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2274       Cost += thisT()->getCastInstrCost(
2275           IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2276           TTI::CastContextHint::None, CostKind);
2277       if (IsSigned) {
2278         Type *CondTy = RetTy->getWithNewBitWidth(1);
2279         Cost += thisT()->getCmpSelInstrCost(
2280             BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2281         Cost += thisT()->getCmpSelInstrCost(
2282             BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2283       }
2284       return Cost;
2285     }
2286     case Intrinsic::ctpop:
2287       ISD = ISD::CTPOP;
2288       // In case of legalization use TCC_Expensive. This is cheaper than a
2289       // library call but still not a cheap instruction.
2290       SingleCallCost = TargetTransformInfo::TCC_Expensive;
2291       break;
2292     case Intrinsic::ctlz:
2293       ISD = ISD::CTLZ;
2294       break;
2295     case Intrinsic::cttz:
2296       ISD = ISD::CTTZ;
2297       break;
2298     case Intrinsic::bswap:
2299       ISD = ISD::BSWAP;
2300       break;
2301     case Intrinsic::bitreverse:
2302       ISD = ISD::BITREVERSE;
2303       break;
2304     }
2305 
2306     const TargetLoweringBase *TLI = getTLI();
2307     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2308 
2309     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2310       if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2311           TLI->isFAbsFree(LT.second)) {
2312         return 0;
2313       }
2314 
2315       // The operation is legal. Assume it costs 1.
2316       // If the type is split to multiple registers, assume that there is some
2317       // overhead to this.
2318       // TODO: Once we have extract/insert subvector cost we need to use them.
2319       if (LT.first > 1)
2320         return (LT.first * 2);
2321       else
2322         return (LT.first * 1);
2323     } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2324       // If the operation is custom lowered then assume
2325       // that the code is twice as expensive.
2326       return (LT.first * 2);
2327     }
2328 
2329     // If we can't lower fmuladd into an FMA estimate the cost as a floating
2330     // point mul followed by an add.
2331     if (IID == Intrinsic::fmuladd)
2332       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2333                                              CostKind) +
2334              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2335                                              CostKind);
2336     if (IID == Intrinsic::experimental_constrained_fmuladd) {
2337       IntrinsicCostAttributes FMulAttrs(
2338         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2339       IntrinsicCostAttributes FAddAttrs(
2340         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2341       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2342              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2343     }
2344 
2345     // Else, assume that we need to scalarize this intrinsic. For math builtins
2346     // this will emit a costly libcall, adding call overhead and spills. Make it
2347     // very expensive.
2348     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2349       // Scalable vectors cannot be scalarized, so return Invalid.
2350       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2351             return isa<ScalableVectorType>(Ty);
2352           }))
2353         return InstructionCost::getInvalid();
2354 
2355       InstructionCost ScalarizationCost =
2356           SkipScalarizationCost
2357               ? ScalarizationCostPassed
2358               : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2359                                          /*Extract*/ false, CostKind);
2360 
2361       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2362       SmallVector<Type *, 4> ScalarTys;
2363       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2364         Type *Ty = Tys[i];
2365         if (Ty->isVectorTy())
2366           Ty = Ty->getScalarType();
2367         ScalarTys.push_back(Ty);
2368       }
2369       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2370       InstructionCost ScalarCost =
2371           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2372       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2373         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2374           if (!ICA.skipScalarizationCost())
2375             ScalarizationCost += getScalarizationOverhead(
2376                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2377           ScalarCalls = std::max(ScalarCalls,
2378                                  cast<FixedVectorType>(VTy)->getNumElements());
2379         }
2380       }
2381       return ScalarCalls * ScalarCost + ScalarizationCost;
2382     }
2383 
2384     // This is going to be turned into a library call, make it expensive.
2385     return SingleCallCost;
2386   }
2387 
2388   /// Compute a cost of the given call instruction.
2389   ///
2390   /// Compute the cost of calling function F with return type RetTy and
2391   /// argument types Tys. F might be nullptr, in this case the cost of an
2392   /// arbitrary call with the specified signature will be returned.
2393   /// This is used, for instance,  when we estimate call of a vector
2394   /// counterpart of the given function.
2395   /// \param F Called function, might be nullptr.
2396   /// \param RetTy Return value types.
2397   /// \param Tys Argument types.
2398   /// \returns The cost of Call instruction.
getCallInstrCost(Function * F,Type * RetTy,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)2399   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2400                                    ArrayRef<Type *> Tys,
2401                                    TTI::TargetCostKind CostKind) {
2402     return 10;
2403   }
2404 
getNumberOfParts(Type * Tp)2405   unsigned getNumberOfParts(Type *Tp) {
2406     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2407     return LT.first.isValid() ? *LT.first.getValue() : 0;
2408   }
2409 
getAddressComputationCost(Type * Ty,ScalarEvolution *,const SCEV *)2410   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2411                                             const SCEV *) {
2412     return 0;
2413   }
2414 
2415   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2416   /// We're assuming that reduction operation are performing the following way:
2417   ///
2418   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2419   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2420   ///            \----------------v-------------/  \----------v------------/
2421   ///                            n/2 elements               n/2 elements
2422   /// %red1 = op <n x t> %val, <n x t> val1
2423   /// After this operation we have a vector %red1 where only the first n/2
2424   /// elements are meaningful, the second n/2 elements are undefined and can be
2425   /// dropped. All other operations are actually working with the vector of
2426   /// length n/2, not n, though the real vector length is still n.
2427   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2428   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2429   ///            \----------------v-------------/  \----------v------------/
2430   ///                            n/4 elements               3*n/4 elements
2431   /// %red2 = op <n x t> %red1, <n x t> val2  - working with the vector of
2432   /// length n/2, the resulting vector has length n/4 etc.
2433   ///
2434   /// The cost model should take into account that the actual length of the
2435   /// vector is reduced on each iteration.
getTreeReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2436   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2437                                        TTI::TargetCostKind CostKind) {
2438     // Targets must implement a default value for the scalable case, since
2439     // we don't know how many lanes the vector has.
2440     if (isa<ScalableVectorType>(Ty))
2441       return InstructionCost::getInvalid();
2442 
2443     Type *ScalarTy = Ty->getElementType();
2444     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2445     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2446         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2447         NumVecElts >= 2) {
2448       // Or reduction for i1 is represented as:
2449       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2450       // %res = cmp ne iReduxWidth %val, 0
2451       // And reduction for i1 is represented as:
2452       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2453       // %res = cmp eq iReduxWidth %val, 11111
2454       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2455       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2456                                        TTI::CastContextHint::None, CostKind) +
2457              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2458                                          CmpInst::makeCmpResultType(ValTy),
2459                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2460     }
2461     unsigned NumReduxLevels = Log2_32(NumVecElts);
2462     InstructionCost ArithCost = 0;
2463     InstructionCost ShuffleCost = 0;
2464     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2465     unsigned LongVectorCount = 0;
2466     unsigned MVTLen =
2467         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2468     while (NumVecElts > MVTLen) {
2469       NumVecElts /= 2;
2470       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2471       ShuffleCost +=
2472           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2473                                   CostKind, NumVecElts, SubTy);
2474       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2475       Ty = SubTy;
2476       ++LongVectorCount;
2477     }
2478 
2479     NumReduxLevels -= LongVectorCount;
2480 
2481     // The minimal length of the vector is limited by the real length of vector
2482     // operations performed on the current platform. That's why several final
2483     // reduction operations are performed on the vectors with the same
2484     // architecture-dependent length.
2485 
2486     // By default reductions need one shuffle per reduction level.
2487     ShuffleCost +=
2488         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2489                                                  std::nullopt, CostKind, 0, Ty);
2490     ArithCost +=
2491         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2492     return ShuffleCost + ArithCost +
2493            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2494                                        CostKind, 0, nullptr, nullptr);
2495   }
2496 
2497   /// Try to calculate the cost of performing strict (in-order) reductions,
2498   /// which involves doing a sequence of floating point additions in lane
2499   /// order, starting with an initial value. For example, consider a scalar
2500   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2501   ///
2502   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2503   ///
2504   ///   %add1 = %InitVal + %v0
2505   ///   %add2 = %add1 + %v1
2506   ///   %add3 = %add2 + %v2
2507   ///   %add4 = %add3 + %v3
2508   ///
2509   /// As a simple estimate we can say the cost of such a reduction is 4 times
2510   /// the cost of a scalar FP addition. We can only estimate the costs for
2511   /// fixed-width vectors here because for scalable vectors we do not know the
2512   /// runtime number of operations.
getOrderedReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2513   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2514                                           TTI::TargetCostKind CostKind) {
2515     // Targets must implement a default value for the scalable case, since
2516     // we don't know how many lanes the vector has.
2517     if (isa<ScalableVectorType>(Ty))
2518       return InstructionCost::getInvalid();
2519 
2520     auto *VTy = cast<FixedVectorType>(Ty);
2521     InstructionCost ExtractCost = getScalarizationOverhead(
2522         VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2523     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2524         Opcode, VTy->getElementType(), CostKind);
2525     ArithCost *= VTy->getNumElements();
2526 
2527     return ExtractCost + ArithCost;
2528   }
2529 
getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,std::optional<FastMathFlags> FMF,TTI::TargetCostKind CostKind)2530   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2531                                              std::optional<FastMathFlags> FMF,
2532                                              TTI::TargetCostKind CostKind) {
2533     assert(Ty && "Unknown reduction vector type");
2534     if (TTI::requiresOrderedReduction(FMF))
2535       return getOrderedReductionCost(Opcode, Ty, CostKind);
2536     return getTreeReductionCost(Opcode, Ty, CostKind);
2537   }
2538 
2539   /// Try to calculate op costs for min/max reduction operations.
2540   /// \param CondTy Conditional type for the Select instruction.
getMinMaxReductionCost(Intrinsic::ID IID,VectorType * Ty,FastMathFlags FMF,TTI::TargetCostKind CostKind)2541   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2542                                          FastMathFlags FMF,
2543                                          TTI::TargetCostKind CostKind) {
2544     // Targets must implement a default value for the scalable case, since
2545     // we don't know how many lanes the vector has.
2546     if (isa<ScalableVectorType>(Ty))
2547       return InstructionCost::getInvalid();
2548 
2549     Type *ScalarTy = Ty->getElementType();
2550     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2551     unsigned NumReduxLevels = Log2_32(NumVecElts);
2552     InstructionCost MinMaxCost = 0;
2553     InstructionCost ShuffleCost = 0;
2554     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2555     unsigned LongVectorCount = 0;
2556     unsigned MVTLen =
2557         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2558     while (NumVecElts > MVTLen) {
2559       NumVecElts /= 2;
2560       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2561 
2562       ShuffleCost +=
2563           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2564                                   CostKind, NumVecElts, SubTy);
2565 
2566       IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2567       MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2568       Ty = SubTy;
2569       ++LongVectorCount;
2570     }
2571 
2572     NumReduxLevels -= LongVectorCount;
2573 
2574     // The minimal length of the vector is limited by the real length of vector
2575     // operations performed on the current platform. That's why several final
2576     // reduction opertions are perfomed on the vectors with the same
2577     // architecture-dependent length.
2578     ShuffleCost +=
2579         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2580                                                  std::nullopt, CostKind, 0, Ty);
2581     IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2582     MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2583     // The last min/max should be in vector registers and we counted it above.
2584     // So just need a single extractelement.
2585     return ShuffleCost + MinMaxCost +
2586            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2587                                        CostKind, 0, nullptr, nullptr);
2588   }
2589 
getExtendedReductionCost(unsigned Opcode,bool IsUnsigned,Type * ResTy,VectorType * Ty,FastMathFlags FMF,TTI::TargetCostKind CostKind)2590   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2591                                            Type *ResTy, VectorType *Ty,
2592                                            FastMathFlags FMF,
2593                                            TTI::TargetCostKind CostKind) {
2594     // Without any native support, this is equivalent to the cost of
2595     // vecreduce.opcode(ext(Ty A)).
2596     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2597     InstructionCost RedCost =
2598         thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2599     InstructionCost ExtCost = thisT()->getCastInstrCost(
2600         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2601         TTI::CastContextHint::None, CostKind);
2602 
2603     return RedCost + ExtCost;
2604   }
2605 
getMulAccReductionCost(bool IsUnsigned,Type * ResTy,VectorType * Ty,TTI::TargetCostKind CostKind)2606   InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2607                                          VectorType *Ty,
2608                                          TTI::TargetCostKind CostKind) {
2609     // Without any native support, this is equivalent to the cost of
2610     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2611     // vecreduce.add(mul(A, B)).
2612     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2613     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2614         Instruction::Add, ExtTy, std::nullopt, CostKind);
2615     InstructionCost ExtCost = thisT()->getCastInstrCost(
2616         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2617         TTI::CastContextHint::None, CostKind);
2618 
2619     InstructionCost MulCost =
2620         thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2621 
2622     return RedCost + MulCost + 2 * ExtCost;
2623   }
2624 
getVectorSplitCost()2625   InstructionCost getVectorSplitCost() { return 1; }
2626 
2627   /// @}
2628 };
2629 
2630 /// Concrete BasicTTIImpl that can be used if no further customization
2631 /// is needed.
2632 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2633   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2634 
2635   friend class BasicTTIImplBase<BasicTTIImpl>;
2636 
2637   const TargetSubtargetInfo *ST;
2638   const TargetLoweringBase *TLI;
2639 
getST()2640   const TargetSubtargetInfo *getST() const { return ST; }
getTLI()2641   const TargetLoweringBase *getTLI() const { return TLI; }
2642 
2643 public:
2644   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2645 };
2646 
2647 } // end namespace llvm
2648 
2649 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2650