diff options
author | Mark Kettenis <kettenis@cvs.openbsd.org> | 2017-02-25 16:53:03 +0000 |
---|---|---|
committer | Mark Kettenis <kettenis@cvs.openbsd.org> | 2017-02-25 16:53:03 +0000 |
commit | 2b5bdee33543957e19ded870b22ddcc8086c432f (patch) | |
tree | a554f7d0a0a883f62f27d141d1210699b3722e85 /gnu/llvm | |
parent | 6afdc1cea26a7ac3918106c7162a0323c711e4ad (diff) |
Disable loop idiom recognition for _libc_memset and _libc_memcpy. These are
the internal names we use in libc for memset and memcpy and having the
compiler optimize them as calls to memset and memcpy will lead to infinite
recursion.
This is a temporary solution while guenther@ tries to figure out a better
way to force calls from inside libc via identifiers that are of hidden
visibility.
ok jsg@, patrick@
Diffstat (limited to 'gnu/llvm')
-rw-r--r-- | gnu/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 466 |
1 files changed, 335 insertions, 131 deletions
diff --git a/gnu/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/gnu/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 4521640e394..991a8f9115e 100644 --- a/gnu/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/gnu/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -11,6 +11,12 @@ // non-loop form. In cases that this kicks in, it can be a significant // performance win. // +// If compiling for code size we avoid idiom recognition if the resulting +// code could be larger than the code for the original loop. One way this could +// happen is if the loop is not removable after idiom recognition due to the +// presence of non-idiom instructions. The initial implementation of the +// heuristics applies to idioms in multi-block loops. +// //===----------------------------------------------------------------------===// // // TODO List: @@ -26,21 +32,19 @@ // i64 and larger types when i64 is legal and the value has few bits set. It // would be good to enhance isel to emit a loop for ctpop in this case. // -// We should enhance the memset/memcpy recognition to handle multiple stores in -// the loop. This would handle things like: -// void foo(_Complex float *P) -// for (i) { __real__(*P) = 0; __imag__(*P) = 0; } -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -55,7 +59,11 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-idiom" @@ -63,9 +71,15 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); +static cl::opt<bool> UseLIRCodeSizeHeurs( + "use-lir-code-size-heurs", + cl::desc("Use loop idiom recognition code size heuristics when compiling" + "with -Os/-Oz"), + cl::init(true), cl::Hidden); + namespace { -class LoopIdiomRecognize : public LoopPass { +class LoopIdiomRecognize { Loop *CurLoop; AliasAnalysis *AA; DominatorTree *DT; @@ -74,41 +88,24 @@ class LoopIdiomRecognize : public LoopPass { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; + bool ApplyCodeSizeHeuristics; public: - static char ID; - explicit LoopIdiomRecognize() : LoopPass(ID) { - initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; + explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), + DL(DL) {} - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } + bool runOnLoop(Loop *L); private: typedef SmallVector<StoreInst *, 8> StoreList; - StoreList StoreRefsForMemset; + typedef MapVector<Value *, StoreList> StoreListMap; + StoreListMap StoreRefsForMemset; + StoreListMap StoreRefsForMemsetPattern; StoreList StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; @@ -122,15 +119,21 @@ private: SmallVectorImpl<BasicBlock *> &ExitBlocks); void collectStores(BasicBlock *BB); - bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy); - bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern, + bool &ForMemcpy); + bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount, + bool ForMemset); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride); + Instruction *TheStore, + SmallPtrSetImpl<Instruction *> &Stores, + const SCEVAddRecExpr *Ev, const SCEV *BECount, + bool NegStride, bool IsLoopMemset = false); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); + bool avoidLIRForMultiBlockLoop(bool IsMemset = false, + bool IsLoopMemset = false); /// @} /// \name Noncountable Loop Idiom Handling @@ -145,38 +148,70 @@ private: /// @} }; +class LoopIdiomRecognizeLegacyPass : public LoopPass { +public: + static char ID; + explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) { + initializeLoopIdiomRecognizeLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); + const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); + + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL); + return LIR.runOnLoop(L); + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + getLoopAnalysisUsage(AU); + } +}; } // End anonymous namespace. -char LoopIdiomRecognize::ID = 0; -INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", - false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL); + if (!LIR.runOnLoop(&L)) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +char LoopIdiomRecognizeLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", + "Recognize loop idioms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", - false, false) +INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom", + "Recognize loop idioms", false, false) -Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } +Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); } -/// deleteDeadInstruction - Delete this instruction. Before we do, go through -/// and zero out all the operands of this instruction. If any of them become -/// dead, delete them and the computation tree that feeds them. -/// -static void deleteDeadInstruction(Instruction *I, - const TargetLibraryInfo *TLI) { - SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end()); +static void deleteDeadInstruction(Instruction *I) { I->replaceAllUsesWith(UndefValue::get(I->getType())); I->eraseFromParent(); - for (Value *Op : Operands) - RecursivelyDeleteTriviallyDeadInstructions(Op, TLI); } //===----------------------------------------------------------------------===// @@ -185,10 +220,7 @@ static void deleteDeadInstruction(Instruction *I, // //===----------------------------------------------------------------------===// -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - +bool LoopIdiomRecognize::runOnLoop(Loop *L) { CurLoop = L; // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. @@ -199,15 +231,12 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { StringRef Name = L->getHeader()->getParent()->getName(); if (Name == "memset" || Name == "memcpy") return false; + if (Name == "_libc_memset" || Name == "_libc_memcpy") + return false; - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *CurLoop->getHeader()->getParent()); - DL = &CurLoop->getHeader()->getModule()->getDataLayout(); + // Determine if code size heuristics need to be applied. + ApplyCodeSizeHeuristics = + L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs; HasMemset = TLI->has(LibFunc::memset); HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); @@ -240,6 +269,14 @@ bool LoopIdiomRecognize::runOnCountableLoop() { << CurLoop->getHeader()->getName() << "\n"); bool MadeChange = false; + + // The following transforms hoist stores/memsets into the loop pre-header. + // Give up if the loop has instructions may throw. + LoopSafetyInfo SafetyInfo; + computeLoopSafetyInfo(&SafetyInfo, CurLoop); + if (SafetyInfo.MayThrow) + return MadeChange; + // Scan all the blocks in the loop that are not in subloops. for (auto *BB : CurLoop->getBlocks()) { // Ignore blocks in subloops. @@ -258,9 +295,9 @@ static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { return (unsigned)SizeInBits >> 3; } -static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { +static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) { const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); - return ConstStride->getAPInt().getZExtValue(); + return ConstStride->getAPInt(); } /// getMemSetPatternValue - If a strided store of the specified value is safe to @@ -305,11 +342,15 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, - bool &ForMemcpy) { + bool &ForMemsetPattern, bool &ForMemcpy) { // Don't touch volatile stores. if (!SI->isSimple()) return false; + // Avoid merging nontemporal stores. + if (SI->getMetadata(LLVMContext::MD_nontemporal)) + return false; + Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -353,7 +394,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, StorePtr->getType()->getPointerAddressSpace() == 0 && (PatternValue = getMemSetPatternValue(StoredVal, DL))) { // It looks like we can use PatternValue! - ForMemset = true; + ForMemsetPattern = true; return true; } @@ -361,7 +402,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, if (HasMemcpy) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. - unsigned Stride = getStoreStride(StoreEv); + APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = getStoreSizeInBytes(SI, DL); if (StoreSize != Stride && StoreSize != -Stride) return false; @@ -393,6 +434,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, void LoopIdiomRecognize::collectStores(BasicBlock *BB) { StoreRefsForMemset.clear(); + StoreRefsForMemsetPattern.clear(); StoreRefsForMemcpy.clear(); for (Instruction &I : *BB) { StoreInst *SI = dyn_cast<StoreInst>(&I); @@ -400,15 +442,22 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { continue; bool ForMemset = false; + bool ForMemsetPattern = false; bool ForMemcpy = false; // Make sure this is a strided store with a constant stride. - if (!isLegalStore(SI, ForMemset, ForMemcpy)) + if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy)) continue; // Save the store locations. - if (ForMemset) - StoreRefsForMemset.push_back(SI); - else if (ForMemcpy) + if (ForMemset) { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemset[Ptr].push_back(SI); + } else if (ForMemsetPattern) { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemsetPattern[Ptr].push_back(SI); + } else if (ForMemcpy) StoreRefsForMemcpy.push_back(SI); } } @@ -430,9 +479,14 @@ bool LoopIdiomRecognize::runOnLoopBlock( // Look for store instructions, which may be optimized to memset/memcpy. collectStores(BB); - // Look for a single store which can be optimized into a memset. - for (auto &SI : StoreRefsForMemset) - MadeChange |= processLoopStore(SI, BECount); + // Look for a single store or sets of stores with a common base, which can be + // optimized into a memset (memset_pattern). The latter most commonly happens + // with structs and handunrolled loops. + for (auto &SL : StoreRefsForMemset) + MadeChange |= processLoopStores(SL.second, BECount, true); + + for (auto &SL : StoreRefsForMemsetPattern) + MadeChange |= processLoopStores(SL.second, BECount, false); // Optimize the store into a memcpy, if it feeds an similarly strided load. for (auto &SI : StoreRefsForMemcpy) @@ -458,26 +512,144 @@ bool LoopIdiomRecognize::runOnLoopBlock( return MadeChange; } -/// processLoopStore - See if this store can be promoted to a memset. -bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - assert(SI->isSimple() && "Expected only non-volatile stores."); +/// processLoopStores - See if this store(s) can be promoted to a memset. +bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, + const SCEV *BECount, + bool ForMemset) { + // Try to find consecutive stores that can be transformed into memsets. + SetVector<StoreInst *> Heads, Tails; + SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other. + SmallVector<unsigned, 16> IndexQueue; + for (unsigned i = 0, e = SL.size(); i < e; ++i) { + assert(SL[i]->isSimple() && "Expected only non-volatile stores."); + + Value *FirstStoredVal = SL[i]->getValueOperand(); + Value *FirstStorePtr = SL[i]->getPointerOperand(); + const SCEVAddRecExpr *FirstStoreEv = + cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr)); + APInt FirstStride = getStoreStride(FirstStoreEv); + unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); + + // See if we can optimize just this store in isolation. + if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) { + Heads.insert(SL[i]); + continue; + } - Value *StoredVal = SI->getValueOperand(); - Value *StorePtr = SI->getPointerOperand(); + Value *FirstSplatValue = nullptr; + Constant *FirstPatternValue = nullptr; - // Check to see if the stride matches the size of the store. If so, then we - // know that every byte is touched in the loop. - const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - unsigned Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); - if (StoreSize != Stride && StoreSize != -Stride) - return false; + if (ForMemset) + FirstSplatValue = isBytewiseValue(FirstStoredVal); + else + FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); + + assert((FirstSplatValue || FirstPatternValue) && + "Expected either splat value or pattern value."); + + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find memset opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + assert(SL[k]->isSimple() && "Expected only non-volatile stores."); + Value *SecondStorePtr = SL[k]->getPointerOperand(); + const SCEVAddRecExpr *SecondStoreEv = + cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr)); + APInt SecondStride = getStoreStride(SecondStoreEv); + + if (FirstStride != SecondStride) + continue; - bool NegStride = StoreSize == -Stride; + Value *SecondStoredVal = SL[k]->getValueOperand(); + Value *SecondSplatValue = nullptr; + Constant *SecondPatternValue = nullptr; + + if (ForMemset) + SecondSplatValue = isBytewiseValue(SecondStoredVal); + else + SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); + + assert((SecondSplatValue || SecondPatternValue) && + "Expected either splat value or pattern value."); + + if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { + if (ForMemset) { + if (FirstSplatValue != SecondSplatValue) + continue; + } else { + if (FirstPatternValue != SecondPatternValue) + continue; + } + Tails.insert(SL[k]); + Heads.insert(SL[i]); + ConsecutiveChain[SL[i]] = SL[k]; + break; + } + } + } + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we transformed so that we don't visit the same store twice. + SmallPtrSet<Value *, 16> TransformedStores; + bool Changed = false; + + // For stores that start but don't end a link in the chain: + for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to transform it. + SmallPtrSet<Instruction *, 8> AdjacentStores; + StoreInst *I = *it; + + StoreInst *HeadStore = I; + unsigned StoreSize = 0; + + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (TransformedStores.count(I)) + break; + AdjacentStores.insert(I); - // See if we can optimize just this store in isolation. - return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount, NegStride); + StoreSize += getStoreSizeInBytes(I, DL); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + Value *StoredVal = HeadStore->getValueOperand(); + Value *StorePtr = HeadStore->getPointerOperand(); + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + APInt Stride = getStoreStride(StoreEv); + + // Check to see if the stride matches the size of the stores. If so, then + // we know that every byte is touched in the loop. + if (StoreSize != Stride && StoreSize != -Stride) + continue; + + bool NegStride = StoreSize == -Stride; + + if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + StoredVal, HeadStore, AdjacentStores, StoreEv, + BECount, NegStride)) { + TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); + Changed = true; + } + } + + return Changed; } /// processLoopMemSet - See if this memset can be promoted to a large memset. @@ -488,7 +660,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, return false; // If we're not allowed to hack on memset, we fail. - if (!TLI->has(LibFunc::memset)) + if (!HasMemset) return false; Value *Pointer = MSI->getDest(); @@ -507,11 +679,12 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, // Check to see if the stride matches the size of the memset. If so, then we // know that every byte is touched in the loop. - const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + if (!ConstStride) + return false; - // TODO: Could also handle negative stride here someday, that will require the - // validity check in mayLoopAccessLocation to be updated though. - if (!Stride || MSI->getLength() != Stride->getValue()) + APInt Stride = ConstStride->getAPInt(); + if (SizeInBytes != Stride && SizeInBytes != -Stride) return false; // Verify that the memset value is loop invariant. If not, we can't promote @@ -520,18 +693,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) return false; + SmallPtrSet<Instruction *, 1> MSIs; + MSIs.insert(MSI); + bool NegStride = SizeInBytes == -Stride; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), SplatValue, MSI, Ev, - BECount, /*NegStride=*/false); + MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, + BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, - const SCEV *BECount, unsigned StoreSize, - AliasAnalysis &AA, - Instruction *IgnoredStore) { +static bool +mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, + SmallPtrSetImpl<Instruction *> &IgnoredStores) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts // at the pointer and has infinite size. @@ -550,8 +727,9 @@ static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) - for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) + for (Instruction &I : **BI) + if (IgnoredStores.count(&I) == 0 && + (AA.getModRefInfo(&I, StoreLoc) & Access)) return true; return false; @@ -574,8 +752,9 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, /// transform this into a memset or memset_pattern in the loop preheader, do so. bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, - Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride) { + Value *StoredVal, Instruction *TheStore, + SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride, bool IsLoopMemset) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; @@ -609,13 +788,16 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *BasePtr = Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, - *AA, TheStore)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); return false; } + if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) + return false; + // Okay, everything looks good, insert the memset. // The # stored bytes is (BECount+1)*Size. Expand the trip count out to @@ -644,13 +826,14 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); + inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); - GV->setUnnamedAddr(true); // Ok to merge these. + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. GV->setAlignment(16); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); @@ -662,7 +845,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(TheStore, TLI); + for (auto *I : Stores) + deleteDeadInstruction(I); ++NumMemSet; return true; } @@ -676,7 +860,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StorePtr = SI->getPointerOperand(); const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - unsigned Stride = getStoreStride(StoreEv); + APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = getStoreSizeInBytes(SI, DL); bool NegStride = StoreSize == -Stride; @@ -714,8 +898,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StoreBasePtr = Expander.expandCodeFor( StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + SmallPtrSet<Instruction *, 1> Stores; + Stores.insert(SI); if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, SI)) { + StoreSize, *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); @@ -735,7 +921,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, SI)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -743,6 +929,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, return false; } + if (avoidLIRForMultiBlockLoop()) + return false; + // Okay, everything is safe, we can transform this! // The # stored bytes is (BECount+1)*Size. Expand the trip count out to @@ -769,11 +958,28 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(SI, TLI); + deleteDeadInstruction(SI); ++NumMemCpy; return true; } +// When compiling for codesize we avoid idiom recognition for a multi-block loop +// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop. +// +bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, + bool IsLoopMemset) { + if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) { + if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) { + DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName() + << " : LIR " << (IsMemset ? "Memset" : "Memcpy") + << " avoided: multi-block top-level loop\n"); + return true; + } + } + + return false; +} + bool LoopIdiomRecognize::runOnNoncountableLoop() { return recognizePopcount(); } @@ -781,7 +987,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { /// Check if the given conditional branch is based on the comparison between /// a variable and zero, and if the variable is non-zero, the control yields to /// the loop entry. If the branch matches the behavior, the variable involved -/// in the comparion is returned. This function will be called to see if the +/// in the comparison is returned. This function will be called to see if the /// precondition and postcondition of the loop are in desirable form. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { if (!BI || !BI->isConditional()) @@ -965,9 +1171,7 @@ bool LoopIdiomRecognize::recognizePopcount() { // It should have a preheader containing nothing but an unconditional branch. BasicBlock *PH = CurLoop->getLoopPreheader(); - if (!PH) - return false; - if (&PH->front() != PH->getTerminator()) + if (!PH || &PH->front() != PH->getTerminator()) return false; auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); if (!EntryBI || EntryBI->isConditional()) @@ -993,7 +1197,7 @@ bool LoopIdiomRecognize::recognizePopcount() { } static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, - DebugLoc DL) { + const DebugLoc &DL) { Value *Ops[] = {Val}; Type *Tys[] = {Val->getType()}; |