1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This file provides a helper that implements much of the TTI interface in 11 /// terms of the target-independent code generator and TargetLowering 12 /// interfaces. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H 17 #define LLVM_CODEGEN_BASICTTIIMPL_H 18 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/BitVector.h" 22 #include "llvm/ADT/SmallPtrSet.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/TargetTransformInfo.h" 27 #include "llvm/Analysis/TargetTransformInfoImpl.h" 28 #include "llvm/Analysis/ValueTracking.h" 29 #include "llvm/CodeGen/ISDOpcodes.h" 30 #include "llvm/CodeGen/TargetLowering.h" 31 #include "llvm/CodeGen/TargetSubtargetInfo.h" 32 #include "llvm/CodeGen/ValueTypes.h" 33 #include "llvm/CodeGenTypes/MachineValueType.h" 34 #include "llvm/IR/BasicBlock.h" 35 #include "llvm/IR/Constant.h" 36 #include "llvm/IR/Constants.h" 37 #include "llvm/IR/DataLayout.h" 38 #include "llvm/IR/DerivedTypes.h" 39 #include "llvm/IR/InstrTypes.h" 40 #include "llvm/IR/Instruction.h" 41 #include "llvm/IR/Instructions.h" 42 #include "llvm/IR/Intrinsics.h" 43 #include "llvm/IR/Operator.h" 44 #include "llvm/IR/Type.h" 45 #include "llvm/IR/Value.h" 46 #include "llvm/Support/Casting.h" 47 #include "llvm/Support/CommandLine.h" 48 #include "llvm/Support/ErrorHandling.h" 49 #include "llvm/Support/MathExtras.h" 50 #include "llvm/Target/TargetMachine.h" 51 #include "llvm/Target/TargetOptions.h" 52 #include "llvm/Transforms/Utils/LoopUtils.h" 53 #include <algorithm> 54 #include <cassert> 55 #include <cstdint> 56 #include <limits> 57 #include <optional> 58 #include <utility> 59 60 namespace llvm { 61 62 class Function; 63 class GlobalValue; 64 class LLVMContext; 65 class ScalarEvolution; 66 class SCEV; 67 class TargetMachine; 68 69 extern cl::opt<unsigned> PartialUnrollingThreshold; 70 71 /// Base class which can be used to help build a TTI implementation. 72 /// 73 /// This class provides as much implementation of the TTI interface as is 74 /// possible using the target independent parts of the code generator. 75 /// 76 /// In order to subclass it, your class must implement a getST() method to 77 /// return the subtarget, and a getTLI() method to return the target lowering. 78 /// We need these methods implemented in the derived class so that this class 79 /// doesn't have to duplicate storage for them. 80 template <typename T> 81 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { 82 private: 83 using BaseT = TargetTransformInfoImplCRTPBase<T>; 84 using TTI = TargetTransformInfo; 85 86 /// Helper function to access this as a T. thisT()87 T *thisT() { return static_cast<T *>(this); } 88 89 /// Estimate a cost of Broadcast as an extract and sequence of insert 90 /// operations. 
getBroadcastShuffleOverhead(FixedVectorType * VTy,TTI::TargetCostKind CostKind)91 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy, 92 TTI::TargetCostKind CostKind) { 93 InstructionCost Cost = 0; 94 // Broadcast cost is equal to the cost of extracting the zero'th element 95 // plus the cost of inserting it into every element of the result vector. 96 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 97 CostKind, 0, nullptr, nullptr); 98 99 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 100 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 101 CostKind, i, nullptr, nullptr); 102 } 103 return Cost; 104 } 105 106 /// Estimate a cost of shuffle as a sequence of extract and insert 107 /// operations. getPermuteShuffleOverhead(FixedVectorType * VTy,TTI::TargetCostKind CostKind)108 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy, 109 TTI::TargetCostKind CostKind) { 110 InstructionCost Cost = 0; 111 // Shuffle cost is equal to the cost of extracting element from its argument 112 // plus the cost of inserting them onto the result vector. 113 114 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from 115 // index 0 of first vector, index 1 of second vector,index 2 of first 116 // vector and finally index 3 of second vector and insert them at index 117 // <0,1,2,3> of result vector. 118 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 119 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 120 CostKind, i, nullptr, nullptr); 121 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 122 CostKind, i, nullptr, nullptr); 123 } 124 return Cost; 125 } 126 127 /// Estimate a cost of subvector extraction as a sequence of extract and 128 /// insert operations. getExtractSubvectorOverhead(VectorType * VTy,TTI::TargetCostKind CostKind,int Index,FixedVectorType * SubVTy)129 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, 130 TTI::TargetCostKind CostKind, 131 int Index, 132 FixedVectorType *SubVTy) { 133 assert(VTy && SubVTy && 134 "Can only extract subvectors from vectors"); 135 int NumSubElts = SubVTy->getNumElements(); 136 assert((!isa<FixedVectorType>(VTy) || 137 (Index + NumSubElts) <= 138 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 139 "SK_ExtractSubvector index out of range"); 140 141 InstructionCost Cost = 0; 142 // Subvector extraction cost is equal to the cost of extracting element from 143 // the source type plus the cost of inserting them into the result vector 144 // type. 145 for (int i = 0; i != NumSubElts; ++i) { 146 Cost += 147 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 148 CostKind, i + Index, nullptr, nullptr); 149 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, 150 CostKind, i, nullptr, nullptr); 151 } 152 return Cost; 153 } 154 155 /// Estimate a cost of subvector insertion as a sequence of extract and 156 /// insert operations. 
getInsertSubvectorOverhead(VectorType * VTy,TTI::TargetCostKind CostKind,int Index,FixedVectorType * SubVTy)157 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, 158 TTI::TargetCostKind CostKind, 159 int Index, 160 FixedVectorType *SubVTy) { 161 assert(VTy && SubVTy && 162 "Can only insert subvectors into vectors"); 163 int NumSubElts = SubVTy->getNumElements(); 164 assert((!isa<FixedVectorType>(VTy) || 165 (Index + NumSubElts) <= 166 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 167 "SK_InsertSubvector index out of range"); 168 169 InstructionCost Cost = 0; 170 // Subvector insertion cost is equal to the cost of extracting element from 171 // the source type plus the cost of inserting them into the result vector 172 // type. 173 for (int i = 0; i != NumSubElts; ++i) { 174 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, 175 CostKind, i, nullptr, nullptr); 176 Cost += 177 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind, 178 i + Index, nullptr, nullptr); 179 } 180 return Cost; 181 } 182 183 /// Local query method delegates up to T which *must* implement this! getST()184 const TargetSubtargetInfo *getST() const { 185 return static_cast<const T *>(this)->getST(); 186 } 187 188 /// Local query method delegates up to T which *must* implement this! getTLI()189 const TargetLoweringBase *getTLI() const { 190 return static_cast<const T *>(this)->getTLI(); 191 } 192 getISDIndexedMode(TTI::MemIndexedMode M)193 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { 194 switch (M) { 195 case TTI::MIM_Unindexed: 196 return ISD::UNINDEXED; 197 case TTI::MIM_PreInc: 198 return ISD::PRE_INC; 199 case TTI::MIM_PreDec: 200 return ISD::PRE_DEC; 201 case TTI::MIM_PostInc: 202 return ISD::POST_INC; 203 case TTI::MIM_PostDec: 204 return ISD::POST_DEC; 205 } 206 llvm_unreachable("Unexpected MemIndexedMode"); 207 } 208 209 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 210 Align Alignment, 211 bool VariableMask, 212 bool IsGatherScatter, 213 TTI::TargetCostKind CostKind, 214 unsigned AddressSpace = 0) { 215 // We cannot scalarize scalable vectors, so return Invalid. 216 if (isa<ScalableVectorType>(DataTy)) 217 return InstructionCost::getInvalid(); 218 219 auto *VT = cast<FixedVectorType>(DataTy); 220 unsigned VF = VT->getNumElements(); 221 222 // Assume the target does not have support for gather/scatter operations 223 // and provide a rough estimate. 224 // 225 // First, compute the cost of the individual memory operations. 226 InstructionCost AddrExtractCost = 227 IsGatherScatter 228 ? getScalarizationOverhead( 229 FixedVectorType::get( 230 PointerType::get(VT->getElementType(), 0), VF), 231 /*Insert=*/false, /*Extract=*/true, CostKind) 232 : 0; 233 234 // The cost of the scalar loads/stores. 235 InstructionCost MemoryOpCost = 236 VF * getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 237 AddressSpace, CostKind); 238 239 // Next, compute the cost of packing the result in a vector. 240 InstructionCost PackingCost = 241 getScalarizationOverhead(VT, Opcode != Instruction::Store, 242 Opcode == Instruction::Store, CostKind); 243 244 InstructionCost ConditionalCost = 0; 245 if (VariableMask) { 246 // Compute the cost of conditionally executing the memory operations with 247 // variable masks. This includes extracting the individual conditions, a 248 // branches and PHIs to combine the results. 
249 // NOTE: Estimating the cost of conditionally executing the memory 250 // operations accurately is quite difficult and the current solution 251 // provides a very rough estimate only. 252 ConditionalCost = 253 getScalarizationOverhead( 254 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF), 255 /*Insert=*/false, /*Extract=*/true, CostKind) + 256 VF * (getCFInstrCost(Instruction::Br, CostKind) + 257 getCFInstrCost(Instruction::PHI, CostKind)); 258 } 259 260 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost; 261 } 262 263 protected: BasicTTIImplBase(const TargetMachine * TM,const DataLayout & DL)264 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) 265 : BaseT(DL) {} 266 virtual ~BasicTTIImplBase() = default; 267 268 using TargetTransformInfoImplBase::DL; 269 270 public: 271 /// \name Scalar TTI Implementations 272 /// @{ allowsMisalignedMemoryAccesses(LLVMContext & Context,unsigned BitWidth,unsigned AddressSpace,Align Alignment,unsigned * Fast)273 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, 274 unsigned AddressSpace, Align Alignment, 275 unsigned *Fast) const { 276 EVT E = EVT::getIntegerVT(Context, BitWidth); 277 return getTLI()->allowsMisalignedMemoryAccesses( 278 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); 279 } 280 281 bool hasBranchDivergence(const Function *F = nullptr) { return false; } 282 isSourceOfDivergence(const Value * V)283 bool isSourceOfDivergence(const Value *V) { return false; } 284 isAlwaysUniform(const Value * V)285 bool isAlwaysUniform(const Value *V) { return false; } 286 isValidAddrSpaceCast(unsigned FromAS,unsigned ToAS)287 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 288 return false; 289 } 290 addrspacesMayAlias(unsigned AS0,unsigned AS1)291 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { 292 return true; 293 } 294 getFlatAddressSpace()295 unsigned getFlatAddressSpace() { 296 // Return an invalid address space. 
297 return -1; 298 } 299 collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID)300 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 301 Intrinsic::ID IID) const { 302 return false; 303 } 304 isNoopAddrSpaceCast(unsigned FromAS,unsigned ToAS)305 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 306 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 307 } 308 getAssumedAddrSpace(const Value * V)309 unsigned getAssumedAddrSpace(const Value *V) const { 310 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 311 } 312 isSingleThreaded()313 bool isSingleThreaded() const { 314 return getTLI()->getTargetMachine().Options.ThreadModel == 315 ThreadModel::Single; 316 } 317 318 std::pair<const Value *, unsigned> getPredicatedAddrSpace(const Value * V)319 getPredicatedAddrSpace(const Value *V) const { 320 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); 321 } 322 rewriteIntrinsicWithAddressSpace(IntrinsicInst * II,Value * OldV,Value * NewV)323 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 324 Value *NewV) const { 325 return nullptr; 326 } 327 isLegalAddImmediate(int64_t imm)328 bool isLegalAddImmediate(int64_t imm) { 329 return getTLI()->isLegalAddImmediate(imm); 330 } 331 isLegalAddScalableImmediate(int64_t Imm)332 bool isLegalAddScalableImmediate(int64_t Imm) { 333 return getTLI()->isLegalAddScalableImmediate(Imm); 334 } 335 isLegalICmpImmediate(int64_t imm)336 bool isLegalICmpImmediate(int64_t imm) { 337 return getTLI()->isLegalICmpImmediate(imm); 338 } 339 340 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 341 bool HasBaseReg, int64_t Scale, unsigned AddrSpace, 342 Instruction *I = nullptr, 343 int64_t ScalableOffset = 0) { 344 TargetLoweringBase::AddrMode AM; 345 AM.BaseGV = BaseGV; 346 AM.BaseOffs = BaseOffset; 347 AM.HasBaseReg = HasBaseReg; 348 AM.Scale = Scale; 349 AM.ScalableOffset = ScalableOffset; 350 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 351 } 352 getPreferredLargeGEPBaseOffset(int64_t MinOffset,int64_t MaxOffset)353 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) { 354 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset); 355 } 356 getStoreMinimumVF(unsigned VF,Type * ScalarMemTy,Type * ScalarValTy)357 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, 358 Type *ScalarValTy) const { 359 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { 360 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); 361 EVT VT = getTLI()->getValueType(DL, SrcTy); 362 if (getTLI()->isOperationLegal(ISD::STORE, VT) || 363 getTLI()->isOperationCustom(ISD::STORE, VT)) 364 return true; 365 366 EVT ValVT = 367 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); 368 EVT LegalizedVT = 369 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); 370 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); 371 }; 372 while (VF > 2 && IsSupportedByTarget(VF)) 373 VF /= 2; 374 return VF; 375 } 376 isIndexedLoadLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)377 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 378 const DataLayout &DL) const { 379 EVT VT = getTLI()->getValueType(DL, Ty); 380 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 381 } 382 isIndexedStoreLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)383 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 384 const DataLayout 
&DL) const { 385 EVT VT = getTLI()->getValueType(DL, Ty); 386 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); 387 } 388 isLSRCostLess(TTI::LSRCost C1,TTI::LSRCost C2)389 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { 390 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); 391 } 392 isNumRegsMajorCostOfLSR()393 bool isNumRegsMajorCostOfLSR() { 394 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); 395 } 396 shouldFoldTerminatingConditionAfterLSR()397 bool shouldFoldTerminatingConditionAfterLSR() const { 398 return TargetTransformInfoImplBase:: 399 shouldFoldTerminatingConditionAfterLSR(); 400 } 401 isProfitableLSRChainElement(Instruction * I)402 bool isProfitableLSRChainElement(Instruction *I) { 403 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); 404 } 405 getScalingFactorCost(Type * Ty,GlobalValue * BaseGV,int64_t BaseOffset,bool HasBaseReg,int64_t Scale,unsigned AddrSpace)406 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, 407 int64_t BaseOffset, bool HasBaseReg, 408 int64_t Scale, unsigned AddrSpace) { 409 TargetLoweringBase::AddrMode AM; 410 AM.BaseGV = BaseGV; 411 AM.BaseOffs = BaseOffset; 412 AM.HasBaseReg = HasBaseReg; 413 AM.Scale = Scale; 414 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) 415 return 0; 416 return -1; 417 } 418 isTruncateFree(Type * Ty1,Type * Ty2)419 bool isTruncateFree(Type *Ty1, Type *Ty2) { 420 return getTLI()->isTruncateFree(Ty1, Ty2); 421 } 422 isProfitableToHoist(Instruction * I)423 bool isProfitableToHoist(Instruction *I) { 424 return getTLI()->isProfitableToHoist(I); 425 } 426 useAA()427 bool useAA() const { return getST()->useAA(); } 428 isTypeLegal(Type * Ty)429 bool isTypeLegal(Type *Ty) { 430 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true); 431 return getTLI()->isTypeLegal(VT); 432 } 433 getRegUsageForType(Type * Ty)434 unsigned getRegUsageForType(Type *Ty) { 435 EVT ETy = getTLI()->getValueType(DL, Ty); 436 return getTLI()->getNumRegisters(Ty->getContext(), ETy); 437 } 438 getGEPCost(Type * PointeeType,const Value * Ptr,ArrayRef<const Value * > Operands,Type * AccessType,TTI::TargetCostKind CostKind)439 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, 440 ArrayRef<const Value *> Operands, Type *AccessType, 441 TTI::TargetCostKind CostKind) { 442 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind); 443 } 444 getEstimatedNumberOfCaseClusters(const SwitchInst & SI,unsigned & JumpTableSize,ProfileSummaryInfo * PSI,BlockFrequencyInfo * BFI)445 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, 446 unsigned &JumpTableSize, 447 ProfileSummaryInfo *PSI, 448 BlockFrequencyInfo *BFI) { 449 /// Try to find the estimated number of clusters. Note that the number of 450 /// clusters identified in this function could be different from the actual 451 /// numbers found in lowering. This function ignore switches that are 452 /// lowered with a mix of jump table / bit test / BTree. This function was 453 /// initially intended to be used when estimating the cost of switch in 454 /// inline cost heuristic, but it's a generic cost model to be used in other 455 /// places (e.g., in loop unrolling). 456 unsigned N = SI.getNumCases(); 457 const TargetLoweringBase *TLI = getTLI(); 458 const DataLayout &DL = this->getDataLayout(); 459 460 JumpTableSize = 0; 461 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); 462 463 // Early exit if both a jump table and bit test are not allowed. 
464 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) 465 return N; 466 467 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); 468 APInt MinCaseVal = MaxCaseVal; 469 for (auto CI : SI.cases()) { 470 const APInt &CaseVal = CI.getCaseValue()->getValue(); 471 if (CaseVal.sgt(MaxCaseVal)) 472 MaxCaseVal = CaseVal; 473 if (CaseVal.slt(MinCaseVal)) 474 MinCaseVal = CaseVal; 475 } 476 477 // Check if suitable for a bit test 478 if (N <= DL.getIndexSizeInBits(0u)) { 479 SmallPtrSet<const BasicBlock *, 4> Dests; 480 for (auto I : SI.cases()) 481 Dests.insert(I.getCaseSuccessor()); 482 483 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, 484 DL)) 485 return 1; 486 } 487 488 // Check if suitable for a jump table. 489 if (IsJTAllowed) { 490 if (N < 2 || N < TLI->getMinimumJumpTableEntries()) 491 return N; 492 uint64_t Range = 493 (MaxCaseVal - MinCaseVal) 494 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; 495 // Check whether a range of clusters is dense enough for a jump table 496 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { 497 JumpTableSize = Range; 498 return 1; 499 } 500 } 501 return N; 502 } 503 shouldBuildLookupTables()504 bool shouldBuildLookupTables() { 505 const TargetLoweringBase *TLI = getTLI(); 506 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || 507 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); 508 } 509 shouldBuildRelLookupTables()510 bool shouldBuildRelLookupTables() const { 511 const TargetMachine &TM = getTLI()->getTargetMachine(); 512 // If non-PIC mode, do not generate a relative lookup table. 513 if (!TM.isPositionIndependent()) 514 return false; 515 516 /// Relative lookup table entries consist of 32-bit offsets. 517 /// Do not generate relative lookup tables for large code models 518 /// in 64-bit achitectures where 32-bit offsets might not be enough. 519 if (TM.getCodeModel() == CodeModel::Medium || 520 TM.getCodeModel() == CodeModel::Large) 521 return false; 522 523 Triple TargetTriple = TM.getTargetTriple(); 524 if (!TargetTriple.isArch64Bit()) 525 return false; 526 527 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it 528 // there. 529 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) 530 return false; 531 532 return true; 533 } 534 haveFastSqrt(Type * Ty)535 bool haveFastSqrt(Type *Ty) { 536 const TargetLoweringBase *TLI = getTLI(); 537 EVT VT = TLI->getValueType(DL, Ty); 538 return TLI->isTypeLegal(VT) && 539 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); 540 } 541 isFCmpOrdCheaperThanFCmpZero(Type * Ty)542 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 543 return true; 544 } 545 getFPOpCost(Type * Ty)546 InstructionCost getFPOpCost(Type *Ty) { 547 // Check whether FADD is available, as a proxy for floating-point in 548 // general. 
549 const TargetLoweringBase *TLI = getTLI(); 550 EVT VT = TLI->getValueType(DL, Ty); 551 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) 552 return TargetTransformInfo::TCC_Basic; 553 return TargetTransformInfo::TCC_Expensive; 554 } 555 preferToKeepConstantsAttached(const Instruction & Inst,const Function & Fn)556 bool preferToKeepConstantsAttached(const Instruction &Inst, 557 const Function &Fn) const { 558 switch (Inst.getOpcode()) { 559 default: 560 break; 561 case Instruction::SDiv: 562 case Instruction::SRem: 563 case Instruction::UDiv: 564 case Instruction::URem: { 565 if (!isa<ConstantInt>(Inst.getOperand(1))) 566 return false; 567 EVT VT = getTLI()->getValueType(DL, Inst.getType()); 568 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes()); 569 } 570 }; 571 572 return false; 573 } 574 getInliningThresholdMultiplier()575 unsigned getInliningThresholdMultiplier() const { return 1; } adjustInliningThreshold(const CallBase * CB)576 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } getCallerAllocaCost(const CallBase * CB,const AllocaInst * AI)577 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const { 578 return 0; 579 } 580 getInlinerVectorBonusPercent()581 int getInlinerVectorBonusPercent() const { return 150; } 582 getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE)583 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 584 TTI::UnrollingPreferences &UP, 585 OptimizationRemarkEmitter *ORE) { 586 // This unrolling functionality is target independent, but to provide some 587 // motivation for its intended use, for x86: 588 589 // According to the Intel 64 and IA-32 Architectures Optimization Reference 590 // Manual, Intel Core models and later have a loop stream detector (and 591 // associated uop queue) that can benefit from partial unrolling. 592 // The relevant requirements are: 593 // - The loop must have no more than 4 (8 for Nehalem and later) branches 594 // taken, and none of them may be calls. 595 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 596 597 // According to the Software Optimization Guide for AMD Family 15h 598 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 599 // and loop buffer which can benefit from partial unrolling. 600 // The relevant requirements are: 601 // - The loop must have fewer than 16 branches 602 // - The loop must have less than 40 uops in all executed loop branches 603 604 // The number of taken branches in a loop is hard to estimate here, and 605 // benchmarking has revealed that it is better not to be conservative when 606 // estimating the branch count. As a result, we'll ignore the branch limits 607 // until someone finds a case where it matters in practice. 608 609 unsigned MaxOps; 610 const TargetSubtargetInfo *ST = getST(); 611 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 612 MaxOps = PartialUnrollingThreshold; 613 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 614 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 615 else 616 return; 617 618 // Scan the loop: don't unroll loops with calls. 
619 for (BasicBlock *BB : L->blocks()) { 620 for (Instruction &I : *BB) { 621 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 622 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 623 if (!thisT()->isLoweredToCall(F)) 624 continue; 625 } 626 627 if (ORE) { 628 ORE->emit([&]() { 629 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), 630 L->getHeader()) 631 << "advising against unrolling the loop because it " 632 "contains a " 633 << ore::NV("Call", &I); 634 }); 635 } 636 return; 637 } 638 } 639 } 640 641 // Enable runtime and partial unrolling up to the specified size. 642 // Enable using trip count upper bound to unroll loops. 643 UP.Partial = UP.Runtime = UP.UpperBound = true; 644 UP.PartialThreshold = MaxOps; 645 646 // Avoid unrolling when optimizing for size. 647 UP.OptSizeThreshold = 0; 648 UP.PartialOptSizeThreshold = 0; 649 650 // Set number of instructions optimized when "back edge" 651 // becomes "fall through" to default value of 2. 652 UP.BEInsns = 2; 653 } 654 getPeelingPreferences(Loop * L,ScalarEvolution & SE,TTI::PeelingPreferences & PP)655 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 656 TTI::PeelingPreferences &PP) { 657 PP.PeelCount = 0; 658 PP.AllowPeeling = true; 659 PP.AllowLoopNestsPeeling = false; 660 PP.PeelProfiledIterations = true; 661 } 662 isHardwareLoopProfitable(Loop * L,ScalarEvolution & SE,AssumptionCache & AC,TargetLibraryInfo * LibInfo,HardwareLoopInfo & HWLoopInfo)663 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 664 AssumptionCache &AC, 665 TargetLibraryInfo *LibInfo, 666 HardwareLoopInfo &HWLoopInfo) { 667 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 668 } 669 preferPredicateOverEpilogue(TailFoldingInfo * TFI)670 bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { 671 return BaseT::preferPredicateOverEpilogue(TFI); 672 } 673 674 TailFoldingStyle 675 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) { 676 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow); 677 } 678 instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II)679 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 680 IntrinsicInst &II) { 681 return BaseT::instCombineIntrinsic(IC, II); 682 } 683 684 std::optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed)685 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, 686 APInt DemandedMask, KnownBits &Known, 687 bool &KnownBitsComputed) { 688 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 689 KnownBitsComputed); 690 } 691 simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> SimplifyAndSetOp)692 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( 693 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 694 APInt &UndefElts2, APInt &UndefElts3, 695 std::function<void(Instruction *, unsigned, APInt, APInt &)> 696 SimplifyAndSetOp) { 697 return BaseT::simplifyDemandedVectorEltsIntrinsic( 698 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 699 SimplifyAndSetOp); 700 } 701 702 virtual std::optional<unsigned> getCacheSize(TargetTransformInfo::CacheLevel Level)703 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 704 return std::optional<unsigned>( 705 
getST()->getCacheSize(static_cast<unsigned>(Level))); 706 } 707 708 virtual std::optional<unsigned> getCacheAssociativity(TargetTransformInfo::CacheLevel Level)709 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 710 std::optional<unsigned> TargetResult = 711 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 712 713 if (TargetResult) 714 return TargetResult; 715 716 return BaseT::getCacheAssociativity(Level); 717 } 718 getCacheLineSize()719 virtual unsigned getCacheLineSize() const { 720 return getST()->getCacheLineSize(); 721 } 722 getPrefetchDistance()723 virtual unsigned getPrefetchDistance() const { 724 return getST()->getPrefetchDistance(); 725 } 726 getMinPrefetchStride(unsigned NumMemAccesses,unsigned NumStridedMemAccesses,unsigned NumPrefetches,bool HasCall)727 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 728 unsigned NumStridedMemAccesses, 729 unsigned NumPrefetches, 730 bool HasCall) const { 731 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 732 NumPrefetches, HasCall); 733 } 734 getMaxPrefetchIterationsAhead()735 virtual unsigned getMaxPrefetchIterationsAhead() const { 736 return getST()->getMaxPrefetchIterationsAhead(); 737 } 738 enableWritePrefetching()739 virtual bool enableWritePrefetching() const { 740 return getST()->enableWritePrefetching(); 741 } 742 shouldPrefetchAddressSpace(unsigned AS)743 virtual bool shouldPrefetchAddressSpace(unsigned AS) const { 744 return getST()->shouldPrefetchAddressSpace(AS); 745 } 746 747 /// @} 748 749 /// \name Vector TTI Implementations 750 /// @{ 751 getRegisterBitWidth(TargetTransformInfo::RegisterKind K)752 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 753 return TypeSize::getFixed(32); 754 } 755 getMaxVScale()756 std::optional<unsigned> getMaxVScale() const { return std::nullopt; } getVScaleForTuning()757 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; } isVScaleKnownToBeAPowerOfTwo()758 bool isVScaleKnownToBeAPowerOfTwo() const { return false; } 759 760 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 761 /// are set if the demanded result elements need to be inserted and/or 762 /// extracted from vectors. getScalarizationOverhead(VectorType * InTy,const APInt & DemandedElts,bool Insert,bool Extract,TTI::TargetCostKind CostKind)763 InstructionCost getScalarizationOverhead(VectorType *InTy, 764 const APInt &DemandedElts, 765 bool Insert, bool Extract, 766 TTI::TargetCostKind CostKind) { 767 /// FIXME: a bitfield is not a reasonable abstraction for talking about 768 /// which elements are needed from a scalable vector 769 if (isa<ScalableVectorType>(InTy)) 770 return InstructionCost::getInvalid(); 771 auto *Ty = cast<FixedVectorType>(InTy); 772 773 assert(DemandedElts.getBitWidth() == Ty->getNumElements() && 774 "Vector size mismatch"); 775 776 InstructionCost Cost = 0; 777 778 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { 779 if (!DemandedElts[i]) 780 continue; 781 if (Insert) 782 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, 783 CostKind, i, nullptr, nullptr); 784 if (Extract) 785 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 786 CostKind, i, nullptr, nullptr); 787 } 788 789 return Cost; 790 } 791 792 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 
getScalarizationOverhead(VectorType * InTy,bool Insert,bool Extract,TTI::TargetCostKind CostKind)793 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, 794 bool Extract, 795 TTI::TargetCostKind CostKind) { 796 if (isa<ScalableVectorType>(InTy)) 797 return InstructionCost::getInvalid(); 798 auto *Ty = cast<FixedVectorType>(InTy); 799 800 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); 801 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, 802 CostKind); 803 } 804 805 /// Estimate the overhead of scalarizing an instructions unique 806 /// non-constant operands. The (potentially vector) types to use for each of 807 /// argument are passes via Tys. 808 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value * > Args,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)809 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, 810 ArrayRef<Type *> Tys, 811 TTI::TargetCostKind CostKind) { 812 assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); 813 814 InstructionCost Cost = 0; 815 SmallPtrSet<const Value*, 4> UniqueOperands; 816 for (int I = 0, E = Args.size(); I != E; I++) { 817 // Disregard things like metadata arguments. 818 const Value *A = Args[I]; 819 Type *Ty = Tys[I]; 820 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && 821 !Ty->isPtrOrPtrVectorTy()) 822 continue; 823 824 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { 825 if (auto *VecTy = dyn_cast<VectorType>(Ty)) 826 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false, 827 /*Extract*/ true, CostKind); 828 } 829 } 830 831 return Cost; 832 } 833 834 /// Estimate the overhead of scalarizing the inputs and outputs of an 835 /// instruction, with return type RetTy and arguments Args of type Tys. If 836 /// Args are unknown (empty), then the cost associated with one argument is 837 /// added as a heuristic. getScalarizationOverhead(VectorType * RetTy,ArrayRef<const Value * > Args,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)838 InstructionCost getScalarizationOverhead(VectorType *RetTy, 839 ArrayRef<const Value *> Args, 840 ArrayRef<Type *> Tys, 841 TTI::TargetCostKind CostKind) { 842 InstructionCost Cost = getScalarizationOverhead( 843 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind); 844 if (!Args.empty()) 845 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind); 846 else 847 // When no information on arguments is provided, we add the cost 848 // associated with one argument as a heuristic. 849 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false, 850 /*Extract*/ true, CostKind); 851 852 return Cost; 853 } 854 855 /// Estimate the cost of type-legalization and the legalized type. getTypeLegalizationCost(Type * Ty)856 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const { 857 LLVMContext &C = Ty->getContext(); 858 EVT MTy = getTLI()->getValueType(DL, Ty); 859 860 InstructionCost Cost = 1; 861 // We keep legalizing the type until we find a legal kind. We assume that 862 // the only operation that costs anything is the split. After splitting 863 // we need to handle two types. 864 while (true) { 865 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); 866 867 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { 868 // Ensure we return a sensible simple VT here, since many callers of 869 // this function require it. 870 MVT VT = MTy.isSimple() ? 
MTy.getSimpleVT() : MVT::i64; 871 return std::make_pair(InstructionCost::getInvalid(), VT); 872 } 873 874 if (LK.first == TargetLoweringBase::TypeLegal) 875 return std::make_pair(Cost, MTy.getSimpleVT()); 876 877 if (LK.first == TargetLoweringBase::TypeSplitVector || 878 LK.first == TargetLoweringBase::TypeExpandInteger) 879 Cost *= 2; 880 881 // Do not loop with f128 type. 882 if (MTy == LK.second) 883 return std::make_pair(Cost, MTy.getSimpleVT()); 884 885 // Keep legalizing the type. 886 MTy = LK.second; 887 } 888 } 889 getMaxInterleaveFactor(ElementCount VF)890 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } 891 892 InstructionCost getArithmeticInstrCost( 893 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 894 TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None}, 895 TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None}, 896 ArrayRef<const Value *> Args = std::nullopt, 897 const Instruction *CxtI = nullptr) { 898 // Check if any of the operands are vector operands. 899 const TargetLoweringBase *TLI = getTLI(); 900 int ISD = TLI->InstructionOpcodeToISD(Opcode); 901 assert(ISD && "Invalid opcode"); 902 903 // TODO: Handle more cost kinds. 904 if (CostKind != TTI::TCK_RecipThroughput) 905 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 906 Opd1Info, Opd2Info, 907 Args, CxtI); 908 909 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 910 911 bool IsFloat = Ty->isFPOrFPVectorTy(); 912 // Assume that floating point arithmetic operations cost twice as much as 913 // integer operations. 914 InstructionCost OpCost = (IsFloat ? 2 : 1); 915 916 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 917 // The operation is legal. Assume it costs 1. 918 // TODO: Once we have extract/insert subvector cost we need to use them. 919 return LT.first * OpCost; 920 } 921 922 if (!TLI->isOperationExpand(ISD, LT.second)) { 923 // If the operation is custom lowered, then assume that the code is twice 924 // as expensive. 925 return LT.first * 2 * OpCost; 926 } 927 928 // An 'Expand' of URem and SRem is special because it may default 929 // to expanding the operation into a sequence of sub-operations 930 // i.e. X % Y -> X-(X/Y)*Y. 931 if (ISD == ISD::UREM || ISD == ISD::SREM) { 932 bool IsSigned = ISD == ISD::SREM; 933 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, 934 LT.second) || 935 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, 936 LT.second)) { 937 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; 938 InstructionCost DivCost = thisT()->getArithmeticInstrCost( 939 DivOpc, Ty, CostKind, Opd1Info, Opd2Info); 940 InstructionCost MulCost = 941 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); 942 InstructionCost SubCost = 943 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 944 return DivCost + MulCost + SubCost; 945 } 946 } 947 948 // We cannot scalarize scalable vectors, so return Invalid. 949 if (isa<ScalableVectorType>(Ty)) 950 return InstructionCost::getInvalid(); 951 952 // Else, assume that we need to scalarize this op. 953 // TODO: If one of the types get legalized by splitting, handle this 954 // similarly to what getCastInstrCost() does. 
955 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 956 InstructionCost Cost = thisT()->getArithmeticInstrCost( 957 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 958 Args, CxtI); 959 // Return the cost of multiple scalar invocation plus the cost of 960 // inserting and extracting the values. 961 SmallVector<Type *> Tys(Args.size(), Ty); 962 return getScalarizationOverhead(VTy, Args, Tys, CostKind) + 963 VTy->getNumElements() * Cost; 964 } 965 966 // We don't know anything about this scalar instruction. 967 return OpCost; 968 } 969 improveShuffleKindFromMask(TTI::ShuffleKind Kind,ArrayRef<int> Mask,VectorType * Ty,int & Index,VectorType * & SubTy)970 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 971 ArrayRef<int> Mask, 972 VectorType *Ty, int &Index, 973 VectorType *&SubTy) const { 974 if (Mask.empty()) 975 return Kind; 976 int NumSrcElts = Ty->getElementCount().getKnownMinValue(); 977 switch (Kind) { 978 case TTI::SK_PermuteSingleSrc: 979 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) 980 return TTI::SK_Reverse; 981 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) 982 return TTI::SK_Broadcast; 983 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && 984 (Index + Mask.size()) <= (size_t)NumSrcElts) { 985 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size()); 986 return TTI::SK_ExtractSubvector; 987 } 988 break; 989 case TTI::SK_PermuteTwoSrc: { 990 int NumSubElts; 991 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( 992 Mask, NumSrcElts, NumSubElts, Index)) { 993 if (Index + NumSubElts > NumSrcElts) 994 return Kind; 995 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts); 996 return TTI::SK_InsertSubvector; 997 } 998 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts)) 999 return TTI::SK_Select; 1000 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts)) 1001 return TTI::SK_Transpose; 1002 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index)) 1003 return TTI::SK_Splice; 1004 break; 1005 } 1006 case TTI::SK_Select: 1007 case TTI::SK_Reverse: 1008 case TTI::SK_Broadcast: 1009 case TTI::SK_Transpose: 1010 case TTI::SK_InsertSubvector: 1011 case TTI::SK_ExtractSubvector: 1012 case TTI::SK_Splice: 1013 break; 1014 } 1015 return Kind; 1016 } 1017 1018 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 1019 ArrayRef<int> Mask, 1020 TTI::TargetCostKind CostKind, int Index, 1021 VectorType *SubTp, 1022 ArrayRef<const Value *> Args = std::nullopt, 1023 const Instruction *CxtI = nullptr) { 1024 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) { 1025 case TTI::SK_Broadcast: 1026 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1027 return getBroadcastShuffleOverhead(FVT, CostKind); 1028 return InstructionCost::getInvalid(); 1029 case TTI::SK_Select: 1030 case TTI::SK_Splice: 1031 case TTI::SK_Reverse: 1032 case TTI::SK_Transpose: 1033 case TTI::SK_PermuteSingleSrc: 1034 case TTI::SK_PermuteTwoSrc: 1035 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1036 return getPermuteShuffleOverhead(FVT, CostKind); 1037 return InstructionCost::getInvalid(); 1038 case TTI::SK_ExtractSubvector: 1039 return getExtractSubvectorOverhead(Tp, CostKind, Index, 1040 cast<FixedVectorType>(SubTp)); 1041 case TTI::SK_InsertSubvector: 1042 return getInsertSubvectorOverhead(Tp, CostKind, Index, 1043 cast<FixedVectorType>(SubTp)); 1044 } 1045 llvm_unreachable("Unknown TTI::ShuffleKind"); 1046 } 1047 1048 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type 
*Src, 1049 TTI::CastContextHint CCH, 1050 TTI::TargetCostKind CostKind, 1051 const Instruction *I = nullptr) { 1052 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 1053 return 0; 1054 1055 const TargetLoweringBase *TLI = getTLI(); 1056 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1057 assert(ISD && "Invalid opcode"); 1058 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 1059 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst); 1060 1061 TypeSize SrcSize = SrcLT.second.getSizeInBits(); 1062 TypeSize DstSize = DstLT.second.getSizeInBits(); 1063 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 1064 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 1065 1066 switch (Opcode) { 1067 default: 1068 break; 1069 case Instruction::Trunc: 1070 // Check for NOOP conversions. 1071 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 1072 return 0; 1073 [[fallthrough]]; 1074 case Instruction::BitCast: 1075 // Bitcast between types that are legalized to the same type are free and 1076 // assume int to/from ptr of the same size is also free. 1077 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 1078 SrcSize == DstSize) 1079 return 0; 1080 break; 1081 case Instruction::FPExt: 1082 if (I && getTLI()->isExtFree(I)) 1083 return 0; 1084 break; 1085 case Instruction::ZExt: 1086 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 1087 return 0; 1088 [[fallthrough]]; 1089 case Instruction::SExt: 1090 if (I && getTLI()->isExtFree(I)) 1091 return 0; 1092 1093 // If this is a zext/sext of a load, return 0 if the corresponding 1094 // extending load exists on target and the result type is legal. 1095 if (CCH == TTI::CastContextHint::Normal) { 1096 EVT ExtVT = EVT::getEVT(Dst); 1097 EVT LoadVT = EVT::getEVT(Src); 1098 unsigned LType = 1099 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); 1100 if (DstLT.first == SrcLT.first && 1101 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 1102 return 0; 1103 } 1104 break; 1105 case Instruction::AddrSpaceCast: 1106 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 1107 Dst->getPointerAddressSpace())) 1108 return 0; 1109 break; 1110 } 1111 1112 auto *SrcVTy = dyn_cast<VectorType>(Src); 1113 auto *DstVTy = dyn_cast<VectorType>(Dst); 1114 1115 // If the cast is marked as legal (or promote) then assume low cost. 1116 if (SrcLT.first == DstLT.first && 1117 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 1118 return SrcLT.first; 1119 1120 // Handle scalar conversions. 1121 if (!SrcVTy && !DstVTy) { 1122 // Just check the op cost. If the operation is legal then assume it costs 1123 // 1. 1124 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1125 return 1; 1126 1127 // Assume that illegal scalar instruction are expensive. 1128 return 4; 1129 } 1130 1131 // Check vector-to-vector casts. 1132 if (DstVTy && SrcVTy) { 1133 // If the cast is between same-sized registers, then the check is simple. 1134 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 1135 1136 // Assume that Zext is done using AND. 1137 if (Opcode == Instruction::ZExt) 1138 return SrcLT.first; 1139 1140 // Assume that sext is done using SHL and SRA. 1141 if (Opcode == Instruction::SExt) 1142 return SrcLT.first * 2; 1143 1144 // Just check the op cost. If the operation is legal then assume it 1145 // costs 1146 // 1 and multiply by the type-legalization overhead. 
1147 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1148 return SrcLT.first * 1; 1149 } 1150 1151 // If we are legalizing by splitting, query the concrete TTI for the cost 1152 // of casting the original vector twice. We also need to factor in the 1153 // cost of the split itself. Count that as 1, to be consistent with 1154 // getTypeLegalizationCost(). 1155 bool SplitSrc = 1156 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 1157 TargetLowering::TypeSplitVector; 1158 bool SplitDst = 1159 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 1160 TargetLowering::TypeSplitVector; 1161 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 1162 DstVTy->getElementCount().isVector()) { 1163 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 1164 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 1165 T *TTI = static_cast<T *>(this); 1166 // If both types need to be split then the split is free. 1167 InstructionCost SplitCost = 1168 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 1169 return SplitCost + 1170 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 1171 CostKind, I)); 1172 } 1173 1174 // Scalarization cost is Invalid, can't assume any num elements. 1175 if (isa<ScalableVectorType>(DstVTy)) 1176 return InstructionCost::getInvalid(); 1177 1178 // In other cases where the source or destination are illegal, assume 1179 // the operation will get scalarized. 1180 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 1181 InstructionCost Cost = thisT()->getCastInstrCost( 1182 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 1183 1184 // Return the cost of multiple scalar invocation plus the cost of 1185 // inserting and extracting the values. 1186 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true, 1187 CostKind) + 1188 Num * Cost; 1189 } 1190 1191 // We already handled vector-to-vector and scalar-to-scalar conversions. 1192 // This 1193 // is where we handle bitcast between vectors and scalars. We need to assume 1194 // that the conversion is scalarized in one way or another. 1195 if (Opcode == Instruction::BitCast) { 1196 // Illegal bitcasts are done by storing and loading from a stack slot. 1197 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false, 1198 /*Extract*/ true, CostKind) 1199 : 0) + 1200 (DstVTy ? 
getScalarizationOverhead(DstVTy, /*Insert*/ true, 1201 /*Extract*/ false, CostKind) 1202 : 0); 1203 } 1204 1205 llvm_unreachable("Unhandled cast"); 1206 } 1207 getExtractWithExtendCost(unsigned Opcode,Type * Dst,VectorType * VecTy,unsigned Index)1208 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1209 VectorType *VecTy, unsigned Index) { 1210 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1211 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1212 CostKind, Index, nullptr, nullptr) + 1213 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1214 TTI::CastContextHint::None, CostKind); 1215 } 1216 1217 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1218 const Instruction *I = nullptr) { 1219 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1220 } 1221 1222 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1223 CmpInst::Predicate VecPred, 1224 TTI::TargetCostKind CostKind, 1225 const Instruction *I = nullptr) { 1226 const TargetLoweringBase *TLI = getTLI(); 1227 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1228 assert(ISD && "Invalid opcode"); 1229 1230 // TODO: Handle other cost kinds. 1231 if (CostKind != TTI::TCK_RecipThroughput) 1232 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1233 I); 1234 1235 // Selects on vectors are actually vector selects. 1236 if (ISD == ISD::SELECT) { 1237 assert(CondTy && "CondTy must exist"); 1238 if (CondTy->isVectorTy()) 1239 ISD = ISD::VSELECT; 1240 } 1241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1242 1243 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1244 !TLI->isOperationExpand(ISD, LT.second)) { 1245 // The operation is legal. Assume it costs 1. Multiply 1246 // by the type-legalization overhead. 1247 return LT.first * 1; 1248 } 1249 1250 // Otherwise, assume that the cast is scalarized. 1251 // TODO: If one of the types get legalized by splitting, handle this 1252 // similarly to what getCastInstrCost() does. 1253 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1254 if (isa<ScalableVectorType>(ValTy)) 1255 return InstructionCost::getInvalid(); 1256 1257 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1258 if (CondTy) 1259 CondTy = CondTy->getScalarType(); 1260 InstructionCost Cost = thisT()->getCmpSelInstrCost( 1261 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); 1262 1263 // Return the cost of multiple scalar invocation plus the cost of 1264 // inserting and extracting the values. 1265 return getScalarizationOverhead(ValVTy, /*Insert*/ true, 1266 /*Extract*/ false, CostKind) + 1267 Num * Cost; 1268 } 1269 1270 // Unknown scalar opcode. 
1271 return 1; 1272 } 1273 getVectorInstrCost(unsigned Opcode,Type * Val,TTI::TargetCostKind CostKind,unsigned Index,Value * Op0,Value * Op1)1274 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1275 TTI::TargetCostKind CostKind, 1276 unsigned Index, Value *Op0, Value *Op1) { 1277 return getRegUsageForType(Val->getScalarType()); 1278 } 1279 getVectorInstrCost(const Instruction & I,Type * Val,TTI::TargetCostKind CostKind,unsigned Index)1280 InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, 1281 TTI::TargetCostKind CostKind, 1282 unsigned Index) { 1283 Value *Op0 = nullptr; 1284 Value *Op1 = nullptr; 1285 if (auto *IE = dyn_cast<InsertElementInst>(&I)) { 1286 Op0 = IE->getOperand(0); 1287 Op1 = IE->getOperand(1); 1288 } 1289 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, 1290 Op1); 1291 } 1292 getReplicationShuffleCost(Type * EltTy,int ReplicationFactor,int VF,const APInt & DemandedDstElts,TTI::TargetCostKind CostKind)1293 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 1294 int VF, 1295 const APInt &DemandedDstElts, 1296 TTI::TargetCostKind CostKind) { 1297 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && 1298 "Unexpected size of DemandedDstElts."); 1299 1300 InstructionCost Cost; 1301 1302 auto *SrcVT = FixedVectorType::get(EltTy, VF); 1303 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); 1304 1305 // The Mask shuffling cost is extract all the elements of the Mask 1306 // and insert each of them Factor times into the wide vector: 1307 // 1308 // E.g. an interleaved group with factor 3: 1309 // %mask = icmp ult <8 x i32> %vec1, %vec2 1310 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1311 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1312 // The cost is estimated as extract all mask elements from the <8xi1> mask 1313 // vector and insert them factor times into the <24xi1> shuffled mask 1314 // vector. 1315 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); 1316 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, 1317 /*Insert*/ false, 1318 /*Extract*/ true, CostKind); 1319 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, 1320 /*Insert*/ true, 1321 /*Extract*/ false, CostKind); 1322 1323 return Cost; 1324 } 1325 1326 InstructionCost 1327 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, 1328 unsigned AddressSpace, TTI::TargetCostKind CostKind, 1329 TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, 1330 const Instruction *I = nullptr) { 1331 assert(!Src->isVoidTy() && "Invalid type"); 1332 // Assume types, such as structs, are expensive. 1333 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1334 return 4; 1335 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 1336 1337 // Assuming that all loads of legal types cost 1. 1338 InstructionCost Cost = LT.first; 1339 if (CostKind != TTI::TCK_RecipThroughput) 1340 return Cost; 1341 1342 const DataLayout &DL = this->getDataLayout(); 1343 if (Src->isVectorTy() && 1344 // In practice it's not currently possible to have a change in lane 1345 // length for extending loads or truncating stores so both types should 1346 // have the same scalable property. 1347 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), 1348 LT.second.getSizeInBits())) { 1349 // This is a vector load that legalizes to a larger type than the vector 1350 // itself. 
Unless the corresponding extending load or truncating store is 1351 // legal, then this will scalarize. 1352 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1353 EVT MemVT = getTLI()->getValueType(DL, Src); 1354 if (Opcode == Instruction::Store) 1355 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1356 else 1357 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1358 1359 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1360 // This is a vector load/store for some illegal type that is scalarized. 1361 // We must account for the cost of building or decomposing the vector. 1362 Cost += getScalarizationOverhead( 1363 cast<VectorType>(Src), Opcode != Instruction::Store, 1364 Opcode == Instruction::Store, CostKind); 1365 } 1366 } 1367 1368 return Cost; 1369 } 1370 getMaskedMemoryOpCost(unsigned Opcode,Type * DataTy,Align Alignment,unsigned AddressSpace,TTI::TargetCostKind CostKind)1371 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1372 Align Alignment, unsigned AddressSpace, 1373 TTI::TargetCostKind CostKind) { 1374 // TODO: Pass on AddressSpace when we have test coverage. 1375 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1376 CostKind); 1377 } 1378 1379 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1380 const Value *Ptr, bool VariableMask, 1381 Align Alignment, 1382 TTI::TargetCostKind CostKind, 1383 const Instruction *I = nullptr) { 1384 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1385 true, CostKind); 1386 } 1387 getStridedMemoryOpCost(unsigned Opcode,Type * DataTy,const Value * Ptr,bool VariableMask,Align Alignment,TTI::TargetCostKind CostKind,const Instruction * I)1388 InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, 1389 const Value *Ptr, bool VariableMask, 1390 Align Alignment, 1391 TTI::TargetCostKind CostKind, 1392 const Instruction *I) { 1393 // For a target without strided memory operations (or for an illegal 1394 // operation type on one which does), assume we lower to a gather/scatter 1395 // operation. (Which may in turn be scalarized.) 1396 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1397 Alignment, CostKind, I); 1398 } 1399 1400 InstructionCost getInterleavedMemoryOpCost( 1401 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1402 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1403 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1404 1405 // We cannot scalarize scalable vectors, so return Invalid. 1406 if (isa<ScalableVectorType>(VecTy)) 1407 return InstructionCost::getInvalid(); 1408 1409 auto *VT = cast<FixedVectorType>(VecTy); 1410 1411 unsigned NumElts = VT->getNumElements(); 1412 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1413 1414 unsigned NumSubElts = NumElts / Factor; 1415 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1416 1417 // Firstly, the cost of load/store operation. 1418 InstructionCost Cost; 1419 if (UseMaskForCond || UseMaskForGaps) 1420 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1421 AddressSpace, CostKind); 1422 else 1423 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1424 CostKind); 1425 1426 // Legalize the vector type, and get the legalized and unlegalized type 1427 // sizes. 
1428 MVT VecTyLT = getTypeLegalizationCost(VecTy).second; 1429 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); 1430 unsigned VecTyLTSize = VecTyLT.getStoreSize(); 1431 1432 // Scale the cost of the memory operation by the fraction of legalized 1433 // instructions that will actually be used. We shouldn't account for the 1434 // cost of dead instructions since they will be removed. 1435 // 1436 // E.g., An interleaved load of factor 8: 1437 // %vec = load <16 x i64>, <16 x i64>* %ptr 1438 // %v0 = shufflevector %vec, undef, <0, 8> 1439 // 1440 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be 1441 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized 1442 // type). The other loads are unused. 1443 // 1444 // TODO: Note that legalization can turn masked loads/stores into unmasked 1445 // (legalized) loads/stores. This can be reflected in the cost. 1446 if (Cost.isValid() && VecTySize > VecTyLTSize) { 1447 // The number of loads of a legal type it will take to represent a load 1448 // of the unlegalized vector type. 1449 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); 1450 1451 // The number of elements of the unlegalized type that correspond to a 1452 // single legal instruction. 1453 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); 1454 1455 // Determine which legal instructions will be used. 1456 BitVector UsedInsts(NumLegalInsts, false); 1457 for (unsigned Index : Indices) 1458 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) 1459 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); 1460 1461 // Scale the cost of the load by the fraction of legal instructions that 1462 // will be used. 1463 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); 1464 } 1465 1466 // Then plus the cost of interleave operation. 1467 assert(Indices.size() <= Factor && 1468 "Interleaved memory op has too many members"); 1469 1470 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); 1471 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); 1472 1473 APInt DemandedLoadStoreElts = APInt::getZero(NumElts); 1474 for (unsigned Index : Indices) { 1475 assert(Index < Factor && "Invalid index for interleaved memory op"); 1476 for (unsigned Elm = 0; Elm < NumSubElts; Elm++) 1477 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 1478 } 1479 1480 if (Opcode == Instruction::Load) { 1481 // The interleave cost is similar to extract sub vectors' elements 1482 // from the wide vector, and insert them into sub vectors. 1483 // 1484 // E.g. An interleaved load of factor 2 (with one member of index 0): 1485 // %vec = load <8 x i32>, <8 x i32>* %ptr 1486 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 1487 // The cost is estimated as extract elements at 0, 2, 4, 6 from the 1488 // <8 x i32> vector and insert them into a <4 x i32> vector. 1489 InstructionCost InsSubCost = thisT()->getScalarizationOverhead( 1490 SubVT, DemandedAllSubElts, 1491 /*Insert*/ true, /*Extract*/ false, CostKind); 1492 Cost += Indices.size() * InsSubCost; 1493 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1494 /*Insert*/ false, 1495 /*Extract*/ true, CostKind); 1496 } else { 1497 // The interleave cost is extract elements from sub vectors, and 1498 // insert them into the wide vector. 1499 // 1500 // E.g. 
An interleaved store of factor 3 with 2 members at indices 0,1: 1501 // (using VF=4): 1502 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> 1503 // %gaps.mask = <true, true, false, true, true, false, 1504 // true, true, false, true, true, false> 1505 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, 1506 // i32 Align, <12 x i1> %gaps.mask 1507 // The cost is estimated as extract all elements (of actual members, 1508 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x 1509 // i32> vector. 1510 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( 1511 SubVT, DemandedAllSubElts, 1512 /*Insert*/ false, /*Extract*/ true, CostKind); 1513 Cost += ExtSubCost * Indices.size(); 1514 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1515 /*Insert*/ true, 1516 /*Extract*/ false, CostKind); 1517 } 1518 1519 if (!UseMaskForCond) 1520 return Cost; 1521 1522 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1523 1524 Cost += thisT()->getReplicationShuffleCost( 1525 I8Type, Factor, NumSubElts, 1526 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, 1527 CostKind); 1528 1529 // The Gaps mask is invariant and created outside the loop, therefore the 1530 // cost of creating it is not accounted for here. However if we have both 1531 // a MaskForGaps and some other mask that guards the execution of the 1532 // memory access, we need to account for the cost of And-ing the two masks 1533 // inside the loop. 1534 if (UseMaskForGaps) { 1535 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1536 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1537 CostKind); 1538 } 1539 1540 return Cost; 1541 } 1542 1543 /// Get intrinsic cost based on arguments. getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1544 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1545 TTI::TargetCostKind CostKind) { 1546 // Check for generically free intrinsics. 1547 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1548 return 0; 1549 1550 // Assume that target intrinsics are cheap. 1551 Intrinsic::ID IID = ICA.getID(); 1552 if (Function::isTargetIntrinsic(IID)) 1553 return TargetTransformInfo::TCC_Basic; 1554 1555 if (ICA.isTypeBasedOnly()) 1556 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1557 1558 Type *RetTy = ICA.getReturnType(); 1559 1560 ElementCount RetVF = 1561 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1562 : ElementCount::getFixed(1)); 1563 const IntrinsicInst *I = ICA.getInst(); 1564 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1565 FastMathFlags FMF = ICA.getFlags(); 1566 switch (IID) { 1567 default: 1568 break; 1569 1570 case Intrinsic::powi: 1571 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) { 1572 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); 1573 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), 1574 ShouldOptForSize)) { 1575 // The cost is modeled on the expansion performed by ExpandPowI in 1576 // SelectionDAGBuilder. 
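// Worked example (illustrative): powi(X, 13) has |exponent| 0b1101, so
// ActiveBits = 4 and PopCount = 3, giving 4 + 3 - 2 = 5 fmuls (three
// squarings X^2, X^4, X^8 plus two multiplies combining the set bits);
// a negative exponent adds one fdiv for the final reciprocal.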
1577 APInt Exponent = RHSC->getValue().abs(); 1578 unsigned ActiveBits = Exponent.getActiveBits(); 1579 unsigned PopCount = Exponent.popcount(); 1580 InstructionCost Cost = (ActiveBits + PopCount - 2) * 1581 thisT()->getArithmeticInstrCost( 1582 Instruction::FMul, RetTy, CostKind); 1583 if (RHSC->isNegative()) 1584 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, 1585 CostKind); 1586 return Cost; 1587 } 1588 } 1589 break; 1590 case Intrinsic::cttz: 1591 // FIXME: If necessary, this should go in target-specific overrides. 1592 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) 1593 return TargetTransformInfo::TCC_Basic; 1594 break; 1595 1596 case Intrinsic::ctlz: 1597 // FIXME: If necessary, this should go in target-specific overrides. 1598 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) 1599 return TargetTransformInfo::TCC_Basic; 1600 break; 1601 1602 case Intrinsic::memcpy: 1603 return thisT()->getMemcpyCost(ICA.getInst()); 1604 1605 case Intrinsic::masked_scatter: { 1606 const Value *Mask = Args[3]; 1607 bool VarMask = !isa<Constant>(Mask); 1608 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1609 return thisT()->getGatherScatterOpCost(Instruction::Store, 1610 ICA.getArgTypes()[0], Args[1], 1611 VarMask, Alignment, CostKind, I); 1612 } 1613 case Intrinsic::masked_gather: { 1614 const Value *Mask = Args[2]; 1615 bool VarMask = !isa<Constant>(Mask); 1616 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1617 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1618 VarMask, Alignment, CostKind, I); 1619 } 1620 case Intrinsic::experimental_vp_strided_store: { 1621 const Value *Data = Args[0]; 1622 const Value *Ptr = Args[1]; 1623 const Value *Mask = Args[3]; 1624 const Value *EVL = Args[4]; 1625 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1626 Align Alignment = I->getParamAlign(1).valueOrOne(); 1627 return thisT()->getStridedMemoryOpCost(Instruction::Store, 1628 Data->getType(), Ptr, VarMask, 1629 Alignment, CostKind, I); 1630 } 1631 case Intrinsic::experimental_vp_strided_load: { 1632 const Value *Ptr = Args[0]; 1633 const Value *Mask = Args[2]; 1634 const Value *EVL = Args[3]; 1635 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1636 Align Alignment = I->getParamAlign(0).valueOrOne(); 1637 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr, 1638 VarMask, Alignment, CostKind, I); 1639 } 1640 case Intrinsic::experimental_stepvector: { 1641 if (isa<ScalableVectorType>(RetTy)) 1642 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1643 // The cost of materialising a constant integer vector. 
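// (For a fixed-length result this is simply <0, 1, 2, ...>, e.g.
// <4 x i32> <i32 0, i32 1, i32 2, i32 3>.)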
1644 return TargetTransformInfo::TCC_Basic; 1645 } 1646 case Intrinsic::vector_extract: { 1647 // FIXME: Handle case where a scalable vector is extracted from a scalable 1648 // vector 1649 if (isa<ScalableVectorType>(RetTy)) 1650 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1651 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1652 return thisT()->getShuffleCost( 1653 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()), 1654 std::nullopt, CostKind, Index, cast<VectorType>(RetTy)); 1655 } 1656 case Intrinsic::vector_insert: { 1657 // FIXME: Handle case where a scalable vector is inserted into a scalable 1658 // vector 1659 if (isa<ScalableVectorType>(Args[1]->getType())) 1660 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1661 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1662 return thisT()->getShuffleCost( 1663 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), 1664 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType())); 1665 } 1666 case Intrinsic::vector_reverse: { 1667 return thisT()->getShuffleCost( 1668 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt, 1669 CostKind, 0, cast<VectorType>(RetTy)); 1670 } 1671 case Intrinsic::vector_splice: { 1672 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1673 return thisT()->getShuffleCost( 1674 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt, 1675 CostKind, Index, cast<VectorType>(RetTy)); 1676 } 1677 case Intrinsic::vector_reduce_add: 1678 case Intrinsic::vector_reduce_mul: 1679 case Intrinsic::vector_reduce_and: 1680 case Intrinsic::vector_reduce_or: 1681 case Intrinsic::vector_reduce_xor: 1682 case Intrinsic::vector_reduce_smax: 1683 case Intrinsic::vector_reduce_smin: 1684 case Intrinsic::vector_reduce_fmax: 1685 case Intrinsic::vector_reduce_fmin: 1686 case Intrinsic::vector_reduce_fmaximum: 1687 case Intrinsic::vector_reduce_fminimum: 1688 case Intrinsic::vector_reduce_umax: 1689 case Intrinsic::vector_reduce_umin: { 1690 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1691 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1692 } 1693 case Intrinsic::vector_reduce_fadd: 1694 case Intrinsic::vector_reduce_fmul: { 1695 IntrinsicCostAttributes Attrs( 1696 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1697 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1698 } 1699 case Intrinsic::fshl: 1700 case Intrinsic::fshr: { 1701 const Value *X = Args[0]; 1702 const Value *Y = Args[1]; 1703 const Value *Z = Args[2]; 1704 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X); 1705 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y); 1706 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z); 1707 const TTI::OperandValueInfo OpInfoBW = 1708 {TTI::OK_UniformConstantValue, 1709 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? 
TTI::OP_PowerOf2 1710 : TTI::OP_None}; 1711 1712 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 1713 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 1714 InstructionCost Cost = 0; 1715 Cost += 1716 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 1717 Cost += 1718 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); 1719 Cost += thisT()->getArithmeticInstrCost( 1720 BinaryOperator::Shl, RetTy, CostKind, OpInfoX, 1721 {OpInfoZ.Kind, TTI::OP_None}); 1722 Cost += thisT()->getArithmeticInstrCost( 1723 BinaryOperator::LShr, RetTy, CostKind, OpInfoY, 1724 {OpInfoZ.Kind, TTI::OP_None}); 1725 // Non-constant shift amounts requires a modulo. 1726 if (!OpInfoZ.isConstant()) 1727 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, 1728 CostKind, OpInfoZ, OpInfoBW); 1729 // For non-rotates (X != Y) we must add shift-by-zero handling costs. 1730 if (X != Y) { 1731 Type *CondTy = RetTy->getWithNewBitWidth(1); 1732 Cost += 1733 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1734 CmpInst::ICMP_EQ, CostKind); 1735 Cost += 1736 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1737 CmpInst::ICMP_EQ, CostKind); 1738 } 1739 return Cost; 1740 } 1741 case Intrinsic::get_active_lane_mask: { 1742 EVT ResVT = getTLI()->getValueType(DL, RetTy, true); 1743 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); 1744 1745 // If we're not expanding the intrinsic then we assume this is cheap 1746 // to implement. 1747 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { 1748 return getTypeLegalizationCost(RetTy).first; 1749 } 1750 1751 // Create the expanded types that will be used to calculate the uadd_sat 1752 // operation. 1753 Type *ExpRetTy = VectorType::get( 1754 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); 1755 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); 1756 InstructionCost Cost = 1757 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1758 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, 1759 CmpInst::ICMP_ULT, CostKind); 1760 return Cost; 1761 } 1762 case Intrinsic::experimental_cttz_elts: { 1763 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); 1764 1765 // If we're not expanding the intrinsic then we assume this is cheap 1766 // to implement. 1767 if (!getTLI()->shouldExpandCttzElements(ArgType)) 1768 return getTypeLegalizationCost(RetTy).first; 1769 1770 // TODO: The costs below reflect the expansion code in 1771 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in 1772 // favour of compile time. 1773 1774 // Find the smallest "sensible" element type to use for the expansion. 
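// (Sketch of the expansion being costed below, computed in NewEltTy lanes:
//    VL - umax(sext(mask) & (splat(VL) - stepvector))
// each set lane contributes VL minus its index, the umax picks out the
// first set lane, and the final subtract converts that into the number of
// unset elements preceding it.)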
1775 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero(); 1776 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64)); 1777 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller()) 1778 VScaleRange = getVScaleRange(I->getCaller(), 64); 1779 1780 unsigned EltWidth = getTLI()->getBitWidthForCttzElements( 1781 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange); 1782 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth); 1783 1784 // Create the new vector type & get the vector length 1785 Type *NewVecTy = VectorType::get( 1786 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount()); 1787 1788 IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector, 1789 NewVecTy, {}, FMF); 1790 InstructionCost Cost = 1791 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind); 1792 1793 Cost += 1794 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind); 1795 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy, 1796 Args[0]->getType(), 1797 TTI::CastContextHint::None, CostKind); 1798 Cost += 1799 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind); 1800 1801 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax, 1802 NewEltTy, NewVecTy, FMF, I, 1); 1803 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind); 1804 Cost += 1805 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind); 1806 1807 return Cost; 1808 } 1809 } 1810 1811 // VP Intrinsics should have the same cost as their non-vp counterpart. 1812 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp 1813 // counterpart when the vector length argument is smaller than the maximum 1814 // vector length. 1815 // TODO: Support other kinds of VPIntrinsics 1816 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) { 1817 std::optional<unsigned> FOp = 1818 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); 1819 if (FOp) { 1820 if (ICA.getID() == Intrinsic::vp_load) { 1821 Align Alignment; 1822 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1823 Alignment = VPI->getPointerAlignment().valueOrOne(); 1824 unsigned AS = 0; 1825 if (ICA.getArgs().size() > 1) 1826 if (auto *PtrTy = 1827 dyn_cast<PointerType>(ICA.getArgs()[0]->getType())) 1828 AS = PtrTy->getAddressSpace(); 1829 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment, 1830 AS, CostKind); 1831 } 1832 if (ICA.getID() == Intrinsic::vp_store) { 1833 Align Alignment; 1834 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1835 Alignment = VPI->getPointerAlignment().valueOrOne(); 1836 unsigned AS = 0; 1837 if (ICA.getArgs().size() >= 2) 1838 if (auto *PtrTy = 1839 dyn_cast<PointerType>(ICA.getArgs()[1]->getType())) 1840 AS = PtrTy->getAddressSpace(); 1841 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment, 1842 AS, CostKind); 1843 } 1844 if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) { 1845 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(), 1846 CostKind); 1847 } 1848 } 1849 1850 std::optional<Intrinsic::ID> FID = 1851 VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID()); 1852 if (FID) { 1853 // Non-vp version will have same Args/Tys except mask and vector length. 
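// For example, @llvm.vp.reduce.add(%start, %v, %mask, %evl) is costed
// below as @llvm.vector.reduce.add(%v): the mask and EVL types are
// dropped, and so is the start value for reductions other than
// fadd/fmul.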
1854 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 && 1855 "Expected VPIntrinsic to have Mask and Vector Length args and " 1856 "types"); 1857 ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2); 1858 1859 // VPReduction intrinsics have a start value argument that their non-vp 1860 // counterparts do not have, except for the fadd and fmul non-vp 1861 // counterpart. 1862 if (VPReductionIntrinsic::isVPReduction(ICA.getID()) && 1863 *FID != Intrinsic::vector_reduce_fadd && 1864 *FID != Intrinsic::vector_reduce_fmul) 1865 NewTys = NewTys.drop_front(); 1866 1867 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys, 1868 ICA.getFlags()); 1869 return thisT()->getIntrinsicInstrCost(NewICA, CostKind); 1870 } 1871 } 1872 1873 // Assume that we need to scalarize this intrinsic.) 1874 // Compute the scalarization overhead based on Args for a vector 1875 // intrinsic. 1876 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 1877 if (RetVF.isVector() && !RetVF.isScalable()) { 1878 ScalarizationCost = 0; 1879 if (!RetTy->isVoidTy()) 1880 ScalarizationCost += getScalarizationOverhead( 1881 cast<VectorType>(RetTy), 1882 /*Insert*/ true, /*Extract*/ false, CostKind); 1883 ScalarizationCost += 1884 getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind); 1885 } 1886 1887 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, 1888 ScalarizationCost); 1889 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1890 } 1891 1892 /// Get intrinsic cost based on argument types. 1893 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the 1894 /// cost of scalarizing the arguments and the return value will be computed 1895 /// based on types. 1896 InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1897 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1898 TTI::TargetCostKind CostKind) { 1899 Intrinsic::ID IID = ICA.getID(); 1900 Type *RetTy = ICA.getReturnType(); 1901 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); 1902 FastMathFlags FMF = ICA.getFlags(); 1903 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); 1904 bool SkipScalarizationCost = ICA.skipScalarizationCost(); 1905 1906 VectorType *VecOpTy = nullptr; 1907 if (!Tys.empty()) { 1908 // The vector reduction operand is operand 0 except for fadd/fmul. 1909 // Their operand 0 is a scalar start value, so the vector op is operand 1. 1910 unsigned VecTyIndex = 0; 1911 if (IID == Intrinsic::vector_reduce_fadd || 1912 IID == Intrinsic::vector_reduce_fmul) 1913 VecTyIndex = 1; 1914 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); 1915 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); 1916 } 1917 1918 // Library call cost - other than size, make it expensive. 1919 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; 1920 unsigned ISD = 0; 1921 switch (IID) { 1922 default: { 1923 // Scalable vectors cannot be scalarized, so return Invalid. 1924 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1925 return isa<ScalableVectorType>(Ty); 1926 })) 1927 return InstructionCost::getInvalid(); 1928 1929 // Assume that we need to scalarize this intrinsic. 1930 InstructionCost ScalarizationCost = 1931 SkipScalarizationCost ? 
ScalarizationCostPassed : 0; 1932 unsigned ScalarCalls = 1; 1933 Type *ScalarRetTy = RetTy; 1934 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1935 if (!SkipScalarizationCost) 1936 ScalarizationCost = getScalarizationOverhead( 1937 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); 1938 ScalarCalls = std::max(ScalarCalls, 1939 cast<FixedVectorType>(RetVTy)->getNumElements()); 1940 ScalarRetTy = RetTy->getScalarType(); 1941 } 1942 SmallVector<Type *, 4> ScalarTys; 1943 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1944 Type *Ty = Tys[i]; 1945 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1946 if (!SkipScalarizationCost) 1947 ScalarizationCost += getScalarizationOverhead( 1948 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 1949 ScalarCalls = std::max(ScalarCalls, 1950 cast<FixedVectorType>(VTy)->getNumElements()); 1951 Ty = Ty->getScalarType(); 1952 } 1953 ScalarTys.push_back(Ty); 1954 } 1955 if (ScalarCalls == 1) 1956 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1957 1958 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1959 InstructionCost ScalarCost = 1960 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1961 1962 return ScalarCalls * ScalarCost + ScalarizationCost; 1963 } 1964 // Look for intrinsics that can be lowered directly or turned into a scalar 1965 // intrinsic call. 1966 case Intrinsic::sqrt: 1967 ISD = ISD::FSQRT; 1968 break; 1969 case Intrinsic::sin: 1970 ISD = ISD::FSIN; 1971 break; 1972 case Intrinsic::cos: 1973 ISD = ISD::FCOS; 1974 break; 1975 case Intrinsic::exp: 1976 ISD = ISD::FEXP; 1977 break; 1978 case Intrinsic::exp2: 1979 ISD = ISD::FEXP2; 1980 break; 1981 case Intrinsic::exp10: 1982 ISD = ISD::FEXP10; 1983 break; 1984 case Intrinsic::log: 1985 ISD = ISD::FLOG; 1986 break; 1987 case Intrinsic::log10: 1988 ISD = ISD::FLOG10; 1989 break; 1990 case Intrinsic::log2: 1991 ISD = ISD::FLOG2; 1992 break; 1993 case Intrinsic::fabs: 1994 ISD = ISD::FABS; 1995 break; 1996 case Intrinsic::canonicalize: 1997 ISD = ISD::FCANONICALIZE; 1998 break; 1999 case Intrinsic::minnum: 2000 ISD = ISD::FMINNUM; 2001 break; 2002 case Intrinsic::maxnum: 2003 ISD = ISD::FMAXNUM; 2004 break; 2005 case Intrinsic::minimum: 2006 ISD = ISD::FMINIMUM; 2007 break; 2008 case Intrinsic::maximum: 2009 ISD = ISD::FMAXIMUM; 2010 break; 2011 case Intrinsic::copysign: 2012 ISD = ISD::FCOPYSIGN; 2013 break; 2014 case Intrinsic::floor: 2015 ISD = ISD::FFLOOR; 2016 break; 2017 case Intrinsic::ceil: 2018 ISD = ISD::FCEIL; 2019 break; 2020 case Intrinsic::trunc: 2021 ISD = ISD::FTRUNC; 2022 break; 2023 case Intrinsic::nearbyint: 2024 ISD = ISD::FNEARBYINT; 2025 break; 2026 case Intrinsic::rint: 2027 ISD = ISD::FRINT; 2028 break; 2029 case Intrinsic::lrint: 2030 ISD = ISD::LRINT; 2031 break; 2032 case Intrinsic::llrint: 2033 ISD = ISD::LLRINT; 2034 break; 2035 case Intrinsic::round: 2036 ISD = ISD::FROUND; 2037 break; 2038 case Intrinsic::roundeven: 2039 ISD = ISD::FROUNDEVEN; 2040 break; 2041 case Intrinsic::pow: 2042 ISD = ISD::FPOW; 2043 break; 2044 case Intrinsic::fma: 2045 ISD = ISD::FMA; 2046 break; 2047 case Intrinsic::fmuladd: 2048 ISD = ISD::FMA; 2049 break; 2050 case Intrinsic::experimental_constrained_fmuladd: 2051 ISD = ISD::STRICT_FMA; 2052 break; 2053 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 
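// The annotation-like intrinsics below only convey information to
// analyses and emit no real machine instructions, so they are modelled
// as free.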
2054 case Intrinsic::lifetime_start: 2055 case Intrinsic::lifetime_end: 2056 case Intrinsic::sideeffect: 2057 case Intrinsic::pseudoprobe: 2058 case Intrinsic::arithmetic_fence: 2059 return 0; 2060 case Intrinsic::masked_store: { 2061 Type *Ty = Tys[0]; 2062 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2063 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 2064 CostKind); 2065 } 2066 case Intrinsic::masked_load: { 2067 Type *Ty = RetTy; 2068 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2069 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 2070 CostKind); 2071 } 2072 case Intrinsic::vector_reduce_add: 2073 case Intrinsic::vector_reduce_mul: 2074 case Intrinsic::vector_reduce_and: 2075 case Intrinsic::vector_reduce_or: 2076 case Intrinsic::vector_reduce_xor: 2077 return thisT()->getArithmeticReductionCost( 2078 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt, 2079 CostKind); 2080 case Intrinsic::vector_reduce_fadd: 2081 case Intrinsic::vector_reduce_fmul: 2082 return thisT()->getArithmeticReductionCost( 2083 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind); 2084 case Intrinsic::vector_reduce_smax: 2085 case Intrinsic::vector_reduce_smin: 2086 case Intrinsic::vector_reduce_umax: 2087 case Intrinsic::vector_reduce_umin: 2088 case Intrinsic::vector_reduce_fmax: 2089 case Intrinsic::vector_reduce_fmin: 2090 case Intrinsic::vector_reduce_fmaximum: 2091 case Intrinsic::vector_reduce_fminimum: 2092 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID), 2093 VecOpTy, ICA.getFlags(), CostKind); 2094 case Intrinsic::abs: { 2095 // abs(X) = select(icmp(X,0),X,sub(0,X)) 2096 Type *CondTy = RetTy->getWithNewBitWidth(1); 2097 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2098 InstructionCost Cost = 0; 2099 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2100 Pred, CostKind); 2101 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2102 Pred, CostKind); 2103 // TODO: Should we add an OperandValueProperties::OP_Zero property? 2104 Cost += thisT()->getArithmeticInstrCost( 2105 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); 2106 return Cost; 2107 } 2108 case Intrinsic::smax: 2109 case Intrinsic::smin: 2110 case Intrinsic::umax: 2111 case Intrinsic::umin: { 2112 // minmax(X,Y) = select(icmp(X,Y),X,Y) 2113 Type *CondTy = RetTy->getWithNewBitWidth(1); 2114 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 2115 CmpInst::Predicate Pred = 2116 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 2117 InstructionCost Cost = 0; 2118 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2119 Pred, CostKind); 2120 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2121 Pred, CostKind); 2122 return Cost; 2123 } 2124 case Intrinsic::sadd_sat: 2125 case Intrinsic::ssub_sat: { 2126 Type *CondTy = RetTy->getWithNewBitWidth(1); 2127 2128 Type *OpTy = StructType::create({RetTy, CondTy}); 2129 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 2130 ? 
Intrinsic::sadd_with_overflow 2131 : Intrinsic::ssub_with_overflow; 2132 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2133 2134 // SatMax -> Overflow && SumDiff < 0 2135 // SatMin -> Overflow && SumDiff >= 0 2136 InstructionCost Cost = 0; 2137 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2138 nullptr, ScalarizationCostPassed); 2139 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2140 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2141 Pred, CostKind); 2142 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 2143 CondTy, Pred, CostKind); 2144 return Cost; 2145 } 2146 case Intrinsic::uadd_sat: 2147 case Intrinsic::usub_sat: { 2148 Type *CondTy = RetTy->getWithNewBitWidth(1); 2149 2150 Type *OpTy = StructType::create({RetTy, CondTy}); 2151 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 2152 ? Intrinsic::uadd_with_overflow 2153 : Intrinsic::usub_with_overflow; 2154 2155 InstructionCost Cost = 0; 2156 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2157 nullptr, ScalarizationCostPassed); 2158 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2159 Cost += 2160 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2161 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2162 return Cost; 2163 } 2164 case Intrinsic::smul_fix: 2165 case Intrinsic::umul_fix: { 2166 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 2167 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 2168 2169 unsigned ExtOp = 2170 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; 2171 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2172 2173 InstructionCost Cost = 0; 2174 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 2175 Cost += 2176 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2177 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 2178 CCH, CostKind); 2179 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 2180 CostKind, 2181 {TTI::OK_AnyValue, TTI::OP_None}, 2182 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2183 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 2184 {TTI::OK_AnyValue, TTI::OP_None}, 2185 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2186 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 2187 return Cost; 2188 } 2189 case Intrinsic::sadd_with_overflow: 2190 case Intrinsic::ssub_with_overflow: { 2191 Type *SumTy = RetTy->getContainedType(0); 2192 Type *OverflowTy = RetTy->getContainedType(1); 2193 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 2194 ? BinaryOperator::Add 2195 : BinaryOperator::Sub; 2196 2197 // Add: 2198 // Overflow -> (Result < LHS) ^ (RHS < 0) 2199 // Sub: 2200 // Overflow -> (Result < LHS) ^ (RHS > 0) 2201 InstructionCost Cost = 0; 2202 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2203 Cost += 2 * thisT()->getCmpSelInstrCost( 2204 Instruction::ICmp, SumTy, OverflowTy, 2205 CmpInst::ICMP_SGT, CostKind); 2206 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 2207 CostKind); 2208 return Cost; 2209 } 2210 case Intrinsic::uadd_with_overflow: 2211 case Intrinsic::usub_with_overflow: { 2212 Type *SumTy = RetTy->getContainedType(0); 2213 Type *OverflowTy = RetTy->getContainedType(1); 2214 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 2215 ? BinaryOperator::Add 2216 : BinaryOperator::Sub; 2217 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 2218 ? 
CmpInst::ICMP_ULT 2219 : CmpInst::ICMP_UGT; 2220 2221 InstructionCost Cost = 0; 2222 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2223 Cost += 2224 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 2225 Pred, CostKind); 2226 return Cost; 2227 } 2228 case Intrinsic::smul_with_overflow: 2229 case Intrinsic::umul_with_overflow: { 2230 Type *MulTy = RetTy->getContainedType(0); 2231 Type *OverflowTy = RetTy->getContainedType(1); 2232 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 2233 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 2234 bool IsSigned = IID == Intrinsic::smul_with_overflow; 2235 2236 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; 2237 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2238 2239 InstructionCost Cost = 0; 2240 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 2241 Cost += 2242 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2243 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 2244 CCH, CostKind); 2245 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, 2246 CostKind, 2247 {TTI::OK_AnyValue, TTI::OP_None}, 2248 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2249 2250 if (IsSigned) 2251 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 2252 CostKind, 2253 {TTI::OK_AnyValue, TTI::OP_None}, 2254 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2255 2256 Cost += thisT()->getCmpSelInstrCost( 2257 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 2258 return Cost; 2259 } 2260 case Intrinsic::fptosi_sat: 2261 case Intrinsic::fptoui_sat: { 2262 if (Tys.empty()) 2263 break; 2264 Type *FromTy = Tys[0]; 2265 bool IsSigned = IID == Intrinsic::fptosi_sat; 2266 2267 InstructionCost Cost = 0; 2268 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, 2269 {FromTy, FromTy}); 2270 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); 2271 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, 2272 {FromTy, FromTy}); 2273 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); 2274 Cost += thisT()->getCastInstrCost( 2275 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, 2276 TTI::CastContextHint::None, CostKind); 2277 if (IsSigned) { 2278 Type *CondTy = RetTy->getWithNewBitWidth(1); 2279 Cost += thisT()->getCmpSelInstrCost( 2280 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2281 Cost += thisT()->getCmpSelInstrCost( 2282 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2283 } 2284 return Cost; 2285 } 2286 case Intrinsic::ctpop: 2287 ISD = ISD::CTPOP; 2288 // In case of legalization use TCC_Expensive. This is cheaper than a 2289 // library call but still not a cheap instruction. 2290 SingleCallCost = TargetTransformInfo::TCC_Expensive; 2291 break; 2292 case Intrinsic::ctlz: 2293 ISD = ISD::CTLZ; 2294 break; 2295 case Intrinsic::cttz: 2296 ISD = ISD::CTTZ; 2297 break; 2298 case Intrinsic::bswap: 2299 ISD = ISD::BSWAP; 2300 break; 2301 case Intrinsic::bitreverse: 2302 ISD = ISD::BITREVERSE; 2303 break; 2304 } 2305 2306 const TargetLoweringBase *TLI = getTLI(); 2307 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy); 2308 2309 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 2310 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 2311 TLI->isFAbsFree(LT.second)) { 2312 return 0; 2313 } 2314 2315 // The operation is legal. Assume it costs 1. 
2316 // If the type is split to multiple registers, assume that there is some 2317 // overhead to this. 2318 // TODO: Once we have extract/insert subvector cost we need to use them. 2319 if (LT.first > 1) 2320 return (LT.first * 2); 2321 else 2322 return (LT.first * 1); 2323 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 2324 // If the operation is custom lowered then assume 2325 // that the code is twice as expensive. 2326 return (LT.first * 2); 2327 } 2328 2329 // If we can't lower fmuladd into an FMA estimate the cost as a floating 2330 // point mul followed by an add. 2331 if (IID == Intrinsic::fmuladd) 2332 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 2333 CostKind) + 2334 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 2335 CostKind); 2336 if (IID == Intrinsic::experimental_constrained_fmuladd) { 2337 IntrinsicCostAttributes FMulAttrs( 2338 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 2339 IntrinsicCostAttributes FAddAttrs( 2340 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 2341 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 2342 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 2343 } 2344 2345 // Else, assume that we need to scalarize this intrinsic. For math builtins 2346 // this will emit a costly libcall, adding call overhead and spills. Make it 2347 // very expensive. 2348 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2349 // Scalable vectors cannot be scalarized, so return Invalid. 2350 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2351 return isa<ScalableVectorType>(Ty); 2352 })) 2353 return InstructionCost::getInvalid(); 2354 2355 InstructionCost ScalarizationCost = 2356 SkipScalarizationCost 2357 ? ScalarizationCostPassed 2358 : getScalarizationOverhead(RetVTy, /*Insert*/ true, 2359 /*Extract*/ false, CostKind); 2360 2361 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 2362 SmallVector<Type *, 4> ScalarTys; 2363 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2364 Type *Ty = Tys[i]; 2365 if (Ty->isVectorTy()) 2366 Ty = Ty->getScalarType(); 2367 ScalarTys.push_back(Ty); 2368 } 2369 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 2370 InstructionCost ScalarCost = 2371 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2372 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2373 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { 2374 if (!ICA.skipScalarizationCost()) 2375 ScalarizationCost += getScalarizationOverhead( 2376 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 2377 ScalarCalls = std::max(ScalarCalls, 2378 cast<FixedVectorType>(VTy)->getNumElements()); 2379 } 2380 } 2381 return ScalarCalls * ScalarCost + ScalarizationCost; 2382 } 2383 2384 // This is going to be turned into a library call, make it expensive. 2385 return SingleCallCost; 2386 } 2387 2388 /// Compute a cost of the given call instruction. 2389 /// 2390 /// Compute the cost of calling function F with return type RetTy and 2391 /// argument types Tys. F might be nullptr, in this case the cost of an 2392 /// arbitrary call with the specified signature will be returned. 2393 /// This is used, for instance, when we estimate call of a vector 2394 /// counterpart of the given function. 2395 /// \param F Called function, might be nullptr. 2396 /// \param RetTy Return value types. 2397 /// \param Tys Argument types. 2398 /// \returns The cost of Call instruction. 
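/// The base implementation returns a flat, conservatively high cost to
/// reflect generic call overhead; targets can override this when they
/// have better information.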
getCallInstrCost(Function * F,Type * RetTy,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)2399 InstructionCost getCallInstrCost(Function *F, Type *RetTy, 2400 ArrayRef<Type *> Tys, 2401 TTI::TargetCostKind CostKind) { 2402 return 10; 2403 } 2404 getNumberOfParts(Type * Tp)2405 unsigned getNumberOfParts(Type *Tp) { 2406 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 2407 return LT.first.isValid() ? *LT.first.getValue() : 0; 2408 } 2409 getAddressComputationCost(Type * Ty,ScalarEvolution *,const SCEV *)2410 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, 2411 const SCEV *) { 2412 return 0; 2413 } 2414 2415 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. 2416 /// We're assuming that reduction operation are performing the following way: 2417 /// 2418 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 2419 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> 2420 /// \----------------v-------------/ \----------v------------/ 2421 /// n/2 elements n/2 elements 2422 /// %red1 = op <n x t> %val, <n x t> val1 2423 /// After this operation we have a vector %red1 where only the first n/2 2424 /// elements are meaningful, the second n/2 elements are undefined and can be 2425 /// dropped. All other operations are actually working with the vector of 2426 /// length n/2, not n, though the real vector length is still n. 2427 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, 2428 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> 2429 /// \----------------v-------------/ \----------v------------/ 2430 /// n/4 elements 3*n/4 elements 2431 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of 2432 /// length n/2, the resulting vector has length n/4 etc. 2433 /// 2434 /// The cost model should take into account that the actual length of the 2435 /// vector is reduced on each iteration. getTreeReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2436 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, 2437 TTI::TargetCostKind CostKind) { 2438 // Targets must implement a default value for the scalable case, since 2439 // we don't know how many lanes the vector has. 2440 if (isa<ScalableVectorType>(Ty)) 2441 return InstructionCost::getInvalid(); 2442 2443 Type *ScalarTy = Ty->getElementType(); 2444 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2445 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2446 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2447 NumVecElts >= 2) { 2448 // Or reduction for i1 is represented as: 2449 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2450 // %res = cmp ne iReduxWidth %val, 0 2451 // And reduction for i1 is represented as: 2452 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2453 // %res = cmp eq iReduxWidth %val, 11111 2454 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2455 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2456 TTI::CastContextHint::None, CostKind) + 2457 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2458 CmpInst::makeCmpResultType(ValTy), 2459 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2460 } 2461 unsigned NumReduxLevels = Log2_32(NumVecElts); 2462 InstructionCost ArithCost = 0; 2463 InstructionCost ShuffleCost = 0; 2464 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2465 unsigned LongVectorCount = 0; 2466 unsigned MVTLen = 2467 LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1; 2468 while (NumVecElts > MVTLen) { 2469 NumVecElts /= 2; 2470 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2471 ShuffleCost += 2472 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 2473 CostKind, NumVecElts, SubTy); 2474 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2475 Ty = SubTy; 2476 ++LongVectorCount; 2477 } 2478 2479 NumReduxLevels -= LongVectorCount; 2480 2481 // The minimal length of the vector is limited by the real length of vector 2482 // operations performed on the current platform. That's why several final 2483 // reduction operations are performed on the vectors with the same 2484 // architecture-dependent length. 2485 2486 // By default reductions need one shuffle per reduction level. 2487 ShuffleCost += 2488 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2489 std::nullopt, CostKind, 0, Ty); 2490 ArithCost += 2491 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2492 return ShuffleCost + ArithCost + 2493 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2494 CostKind, 0, nullptr, nullptr); 2495 } 2496 2497 /// Try to calculate the cost of performing strict (in-order) reductions, 2498 /// which involves doing a sequence of floating point additions in lane 2499 /// order, starting with an initial value. For example, consider a scalar 2500 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2501 /// 2502 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2503 /// 2504 /// %add1 = %InitVal + %v0 2505 /// %add2 = %add1 + %v1 2506 /// %add3 = %add2 + %v2 2507 /// %add4 = %add3 + %v3 2508 /// 2509 /// As a simple estimate we can say the cost of such a reduction is 4 times 2510 /// the cost of a scalar FP addition. We can only estimate the costs for 2511 /// fixed-width vectors here because for scalable vectors we do not know the 2512 /// runtime number of operations. getOrderedReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2513 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, 2514 TTI::TargetCostKind CostKind) { 2515 // Targets must implement a default value for the scalable case, since 2516 // we don't know how many lanes the vector has. 2517 if (isa<ScalableVectorType>(Ty)) 2518 return InstructionCost::getInvalid(); 2519 2520 auto *VTy = cast<FixedVectorType>(Ty); 2521 InstructionCost ExtractCost = getScalarizationOverhead( 2522 VTy, /*Insert=*/false, /*Extract=*/true, CostKind); 2523 InstructionCost ArithCost = thisT()->getArithmeticInstrCost( 2524 Opcode, VTy->getElementType(), CostKind); 2525 ArithCost *= VTy->getNumElements(); 2526 2527 return ExtractCost + ArithCost; 2528 } 2529 getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,std::optional<FastMathFlags> FMF,TTI::TargetCostKind CostKind)2530 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 2531 std::optional<FastMathFlags> FMF, 2532 TTI::TargetCostKind CostKind) { 2533 assert(Ty && "Unknown reduction vector type"); 2534 if (TTI::requiresOrderedReduction(FMF)) 2535 return getOrderedReductionCost(Opcode, Ty, CostKind); 2536 return getTreeReductionCost(Opcode, Ty, CostKind); 2537 } 2538 2539 /// Try to calculate op costs for min/max reduction operations. 2540 /// \param CondTy Conditional type for the Select instruction. 
getMinMaxReductionCost(Intrinsic::ID IID,VectorType * Ty,FastMathFlags FMF,TTI::TargetCostKind CostKind)2541 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 2542 FastMathFlags FMF, 2543 TTI::TargetCostKind CostKind) { 2544 // Targets must implement a default value for the scalable case, since 2545 // we don't know how many lanes the vector has. 2546 if (isa<ScalableVectorType>(Ty)) 2547 return InstructionCost::getInvalid(); 2548 2549 Type *ScalarTy = Ty->getElementType(); 2550 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2551 unsigned NumReduxLevels = Log2_32(NumVecElts); 2552 InstructionCost MinMaxCost = 0; 2553 InstructionCost ShuffleCost = 0; 2554 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2555 unsigned LongVectorCount = 0; 2556 unsigned MVTLen = 2557 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 2558 while (NumVecElts > MVTLen) { 2559 NumVecElts /= 2; 2560 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2561 2562 ShuffleCost += 2563 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 2564 CostKind, NumVecElts, SubTy); 2565 2566 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF); 2567 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind); 2568 Ty = SubTy; 2569 ++LongVectorCount; 2570 } 2571 2572 NumReduxLevels -= LongVectorCount; 2573 2574 // The minimal length of the vector is limited by the real length of vector 2575 // operations performed on the current platform. That's why several final 2576 // reduction operations are performed on the vectors with the same 2577 // architecture-dependent length. 2578 ShuffleCost += 2579 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2580 std::nullopt, CostKind, 0, Ty); 2581 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF); 2582 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind); 2583 // The last min/max should be in vector registers and we counted it above. 2584 // So just need a single extractelement. 2585 return ShuffleCost + MinMaxCost + 2586 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2587 CostKind, 0, nullptr, nullptr); 2588 } 2589 getExtendedReductionCost(unsigned Opcode,bool IsUnsigned,Type * ResTy,VectorType * Ty,FastMathFlags FMF,TTI::TargetCostKind CostKind)2590 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, 2591 Type *ResTy, VectorType *Ty, 2592 FastMathFlags FMF, 2593 TTI::TargetCostKind CostKind) { 2594 // Without any native support, this is equivalent to the cost of 2595 // vecreduce.opcode(ext(Ty A)). 2596 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2597 InstructionCost RedCost = 2598 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind); 2599 InstructionCost ExtCost = thisT()->getCastInstrCost( 2600 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2601 TTI::CastContextHint::None, CostKind); 2602 2603 return RedCost + ExtCost; 2604 } 2605 getMulAccReductionCost(bool IsUnsigned,Type * ResTy,VectorType * Ty,TTI::TargetCostKind CostKind)2606 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, 2607 VectorType *Ty, 2608 TTI::TargetCostKind CostKind) { 2609 // Without any native support, this is equivalent to the cost of 2610 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or 2611 // vecreduce.add(mul(A, B)).
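// Illustrative example: for Ty = <8 x i16> and ResTy = i32 this is
// costed as two v8i16 -> v8i32 extends, one v8i32 mul and a v8i32 add
// reduction (RedCost + MulCost + 2 * ExtCost below).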
2612 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2613 InstructionCost RedCost = thisT()->getArithmeticReductionCost( 2614 Instruction::Add, ExtTy, std::nullopt, CostKind); 2615 InstructionCost ExtCost = thisT()->getCastInstrCost( 2616 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2617 TTI::CastContextHint::None, CostKind); 2618 2619 InstructionCost MulCost = 2620 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2621 2622 return RedCost + MulCost + 2 * ExtCost; 2623 } 2624 getVectorSplitCost()2625 InstructionCost getVectorSplitCost() { return 1; } 2626 2627 /// @} 2628 }; 2629 2630 /// Concrete BasicTTIImpl that can be used if no further customization 2631 /// is needed. 2632 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2633 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2634 2635 friend class BasicTTIImplBase<BasicTTIImpl>; 2636 2637 const TargetSubtargetInfo *ST; 2638 const TargetLoweringBase *TLI; 2639 getST()2640 const TargetSubtargetInfo *getST() const { return ST; } getTLI()2641 const TargetLoweringBase *getTLI() const { return TLI; } 2642 2643 public: 2644 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2645 }; 2646 2647 } // end namespace llvm 2648 2649 #endif // LLVM_CODEGEN_BASICTTIIMPL_H 2650