-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 266
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 1134
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h | 27
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td | 602
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp | 123
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h | 52
-rw-r--r-- | gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp | 3
-rw-r--r-- | gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile | 3
8 files changed, 1590 insertions, 620 deletions
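The headline change in this diff is return-protector (retguard) support for AArch64: a new RETGUARD_JMP_TRAP pseudo-instruction handled in AArch64AsmPrinter.cpp, plus the new AArch64ReturnProtectorLowering.{cpp,h} files. As a rough sketch only — the register name xN and the label name are stand-ins, not taken from the commit (the actual lowering uses the pseudo's first operand and a compiler-generated temporary symbol) — the pseudo expands to a compare-and-trap sequence of this shape:

    cbz   xN, .Lretguard_ok    ; xN carries the retguard check value; zero means the check passed,
                               ; so branch over the trap
    brk   #0x1                 ; mismatch: trap instead of continuing to the return
.Lretguard_ok:

This corresponds to the CBZX / BRK #1 / temp-label emission added to EmitInstruction() in the AArch64AsmPrinter.cpp hunk below.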
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index ada995bad37..3fbd9892e30 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1,4 +1,4 @@ -//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===// +//===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===// // // The LLVM Compiler Infrastructure // @@ -12,34 +12,47 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64.h" #include "AArch64MCInstLower.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" +#include "AArch64TargetObjectFile.h" #include "InstPrinter/AArch64InstPrinter.h" -#include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <map> +#include <memory> + using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -49,15 +62,14 @@ namespace { class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; + const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), - SM(*this), AArch64FI(nullptr) {} + SM(*this) {} - const char *getPassName() const override { - return "AArch64 Assembly Printer"; - } + StringRef getPassName() const override { return "AArch64 Assembly Printer"; } /// \brief Wrapper for MCInstLowering.lowerOperand() for the /// tblgen'erated pseudo lowering. @@ -69,6 +81,13 @@ public: const MachineInstr &MI); void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); + + void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI); + void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); + void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI); + + void EmitSled(const MachineInstr &MI, SledKind Kind); + /// \brief tblgen'erated driver function for lowering simple MI->MC /// pseudo instructions. 
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, @@ -83,11 +102,13 @@ public: bool runOnMachineFunction(MachineFunction &F) override { AArch64FI = F.getInfo<AArch64FunctionInfo>(); - return AsmPrinter::runOnMachineFunction(F); + STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget()); + bool Result = AsmPrinter::runOnMachineFunction(F); + emitXRayTable(); + return Result; } private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); bool printAsmRegInClass(const MachineOperand &MO, @@ -107,18 +128,76 @@ private: MCSymbol *GetCPISymbol(unsigned CPID) const override; void EmitEndOfAsmFile(Module &M) override; - AArch64FunctionInfo *AArch64FI; + + AArch64FunctionInfo *AArch64FI = nullptr; /// \brief Emit the LOHs contained in AArch64FI. void EmitLOHs(); - typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; + /// Emit instruction to set float register to zero. + void EmitFMov0(const MachineInstr &MI); + + using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>; + MInstToMCSymbol LOHInstToLabel; }; -} // end of anonymous namespace +} // end anonymous namespace -//===----------------------------------------------------------------------===// +void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) +{ + EmitSled(MI, SledKind::FUNCTION_ENTER); +} + +void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) +{ + EmitSled(MI, SledKind::FUNCTION_EXIT); +} + +void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) +{ + EmitSled(MI, SledKind::TAIL_CALL); +} + +void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) +{ + static const int8_t NoopsInSledCount = 7; + // We want to emit the following pattern: + // + // .Lxray_sled_N: + // ALIGN + // B #32 + // ; 7 NOP instructions (28 bytes) + // .tmpN + // + // We need the 28 bytes (7 instructions) because at runtime, we'd be patching + // over the full 32 bytes (8 instructions) with the following pattern: + // + // STP X0, X30, [SP, #-16]! ; push X0 and the link register to the stack + // LDR W0, #12 ; W0 := function ID + // LDR X16,#12 ; X16 := addr of __xray_FunctionEntry or __xray_FunctionExit + // BLR X16 ; call the tracing trampoline + // ;DATA: 32 bits of function ID + // ;DATA: lower 32 bits of the address of the trampoline + // ;DATA: higher 32 bits of the address of the trampoline + // LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack + // + OutStreamer->EmitCodeAlignment(4); + auto CurSled = OutContext.createTempSymbol("xray_sled_", true); + OutStreamer->EmitLabel(CurSled); + auto Target = OutContext.createTempSymbol(); + + // Emit "B #32" instruction, which jumps over the next 28 bytes. + // The operand has to be the number of 4-byte instructions to jump over, + // including the current instruction. 
+ EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B).addImm(8)); + + for (int8_t I = 0; I < NoopsInSledCount; I++) + EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); + + OutStreamer->EmitLabel(Target); + recordSled(CurSled, MI, Kind); +} void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); @@ -131,19 +210,29 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); SM.serializeToStackMapSection(); } -} -MachineLocation -AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + if (TT.isOSBinFormatCOFF()) { + const auto &TLOF = + static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering()); + + std::string Flags; + raw_string_ostream OS(Flags); + + for (const auto &Function : M) + TLOF.emitLinkerFlagsForGlobal(OS, &Function); + for (const auto &Global : M.globals()) + TLOF.emitLinkerFlagsForGlobal(OS, &Global); + for (const auto &Alias : M.aliases()) + TLOF.emitLinkerFlagsForGlobal(OS, &Alias); + + OS.flush(); + + // Output collected flags + if (!Flags.empty()) { + OutStreamer->SwitchSection(TLOF.getDrectveSection()); + OutStreamer->EmitBytes(Flags); + } } - return Location; } void AArch64AsmPrinter::EmitLOHs() { @@ -171,7 +260,7 @@ MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const { // Darwin uses a linker-private symbol name for constant-pools (to // avoid addends on the relocation?), ELF has no such concept and // uses a normal private symbol. - if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) + if (!getDataLayout().getLinkerPrivateGlobalPrefix().empty()) return OutContext.getOrCreateSymbol( Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber()) + "_" + Twine(CPID)); @@ -238,8 +327,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = - MF->getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const TargetRegisterInfo *RI = STI->getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -265,6 +353,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: return true; // Unknown modifier. + case 'a': // Print 'a' modifier + PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); + return false; case 'w': // Print W register case 'x': // Print X register if (MO.isReg()) @@ -333,7 +424,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a') return true; // Unknown modifier. 
const MachineOperand &MO = MI->getOperand(OpNum); @@ -364,7 +455,7 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { - unsigned NumNOPBytes = MI.getOperand(1).getImm(); + unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes(); SM.recordStackMap(MI); assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); @@ -396,7 +487,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, PatchPointOpers Opers(&MI); - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + int64_t CallTarget = Opers.getCallTarget().getImm(); unsigned EncodedBytes = 0; if (CallTarget) { assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && @@ -404,16 +495,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 16; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF) .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF) .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF) @@ -421,7 +512,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg)); } // Emit padding. 
- unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + unsigned NumBytes = Opers.getNumPatchBytes(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); assert((NumBytes - EncodedBytes) % 4 == 0 && @@ -430,6 +521,47 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } +void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { + unsigned DestReg = MI.getOperand(0).getReg(); + if (STI->hasZeroCycleZeroing() && !STI->hasZeroCycleZeroingFPWorkaround()) { + // Convert H/S/D register to corresponding Q register + if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) + DestReg = AArch64::Q0 + (DestReg - AArch64::H0); + else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) + DestReg = AArch64::Q0 + (DestReg - AArch64::S0); + else { + assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + DestReg = AArch64::Q0 + (DestReg - AArch64::D0); + } + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else { + MCInst FMov; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVH0: + FMov.setOpcode(AArch64::FMOVWHr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; + } + EmitToStreamer(*OutStreamer, FMov); + } +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" @@ -451,6 +583,20 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; + case AArch64::MOVIv2d_ns: + // If the target has <rdar://problem/16473581>, lower this + // instruction to movi.16b instead. 
+ if (STI->hasZeroCycleZeroingFPWorkaround() && + MI->getOperand(1).getImm() == 0) { + MCInst TmpInst; + TmpInst.setOpcode(AArch64::MOVIv16b_ns); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm())); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + break; + case AArch64::DBG_VALUE: { if (isVerbose() && OutStreamer->hasRawTextSupport()) { SmallString<128> TmpStr; @@ -491,8 +637,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO_Sym = MI->getOperand(0); MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym); MCOperand Sym, SymTLSDescLo12, SymTLSDesc; - MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); + MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE); MCInstLowering.lowerOperand(MO_Sym, Sym); MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12); @@ -535,11 +680,42 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + case AArch64::FMOVH0: + case AArch64::FMOVS0: + case AArch64::FMOVD0: + EmitFMov0(*MI); + return; + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); case TargetOpcode::PATCHPOINT: return LowerPATCHPOINT(*OutStreamer, SM, *MI); + + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: + LowerPATCHABLE_FUNCTION_ENTER(*MI); + return; + + case TargetOpcode::PATCHABLE_FUNCTION_EXIT: + LowerPATCHABLE_FUNCTION_EXIT(*MI); + return; + + case TargetOpcode::PATCHABLE_TAIL_CALL: + LowerPATCHABLE_TAIL_CALL(*MI); + return; + + case AArch64::RETGUARD_JMP_TRAP: + { + MCSymbol *RGSuccSym = OutContext.createTempSymbol(); + /* Compare and branch */ + EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::CBZX) + .addReg(MI->getOperand(0).getReg()) + .addExpr(MCSymbolRefExpr::create(RGSuccSym, OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::BRK).addImm(1)); + OutStreamer->EmitLabel(RGSuccSym); + return; + } + } // Finally, do the automated lowerings for everything else. @@ -550,7 +726,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { // Force static initialization. extern "C" void LLVMInitializeAArch64AsmPrinter() { - RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget); - RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget); - RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target); + RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget()); + RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget()); + RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target()); } diff --git a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 3f63d049c34..250d4edcc56 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. 
"frame record") | // |-----------------------------------| <- fp(=x29) @@ -90,20 +94,44 @@ #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64ReturnProtectorLowering.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <vector> using namespace llvm; @@ -115,34 +143,60 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone", STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); +/// Look at each instruction that references stack frames and return the stack +/// size limit beyond which some of these instructions will require a scratch +/// register during their expansion later. +static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { + // FIXME: For now, just conservatively guestimate based on unscaled indexing + // range. We'll end up allocating an unnecessary spill slot a lot, but + // realistically that's not a big deal at this stage of the game. + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue() || MI.isPseudo() || + MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::ADDSXri) + continue; + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isFI()) + continue; + + int Offset = 0; + if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == + AArch64FrameOffsetCannotUpdate) + return 0; + } + } + } + return 255; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; // Don't use the red zone if the function explicitly asks us not to. // This is typically used for kernel code. - if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone)) + if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) return false; - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. 
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); } /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasStackMap() || - MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF)); + // Retain behavior of always omitting the FP for leaf functions when possible. + return (MFI.hasCalls() && + MF.getTarget().Options.DisableFramePointerElim(MF)) || + MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || + MFI.hasStackMap() || MFI.hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is @@ -152,10 +206,10 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { /// included as part of the stack frame. bool AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); + return !MF.getFrameInfo().hasVarSizedObjects(); } -void AArch64FrameLowering::eliminateCallFramePseudoInstr( +MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = @@ -170,7 +224,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); + Amount = alignTo(Amount, Align); if (!IsDestroy) Amount = -Amount; @@ -186,7 +240,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. // - // Mostly call frames will be allocated at the start of a function so + // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); @@ -198,106 +252,238 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, TII); } - MBB.erase(I); + return MBB.erase(I); } void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const MCRegisterInfo *MRI = STI.getRegisterInfo(); + const TargetInstrInfo *TII = STI.getInstrInfo(); DebugLoc DL = MBB.findDebugLoc(MBBI); // Add callee saved registers to move list. 
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) return; - const DataLayout &TD = MF.getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD.getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - + int64_t Offset = + MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } -/// Get FPOffset by analyzing the first instruction. -static int getFPOffsetInPrologue(MachineInstr *MBBI) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert(((MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre) && - MBBI->getOperand(3).getReg() == AArch64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0)); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - return FPOffset; -} +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. 
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { + MachineFunction *MF = MBB->getParent(); + + // If MBB is an entry block, use X9 as the scratch register + if (&MF->front() == MBB) + return AArch64::X9; -static bool isCSSave(MachineInstr *MBBI) { - return MBBI->getOpcode() == AArch64::STPXi || - MBBI->getOpcode() == AArch64::STPDi || - MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre; + const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + LivePhysRegs LiveRegs(TRI); + LiveRegs.addLiveIns(*MBB); + + // Mark callee saved registers as used so we will not choose them. + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + // Prefer X9 since it was historically used for the prologue scratch reg. + const MachineRegisterInfo &MRI = MF->getRegInfo(); + if (LiveRegs.available(MRI, AArch64::X9)) + return AArch64::X9; + + for (unsigned Reg : AArch64::GPR64RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + return AArch64::NoRegister; } bool AArch64FrameLowering::canUseAsPrologue( const MachineBasicBlock &MBB) const { const MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. - // Otherwise, we may need a scratch register to be available and we do not - // support that for now. - return !RegInfo->needsStackRealignment(*MF); + if (!RegInfo->needsStackRealignment(*MF)) + return true; + // Otherwise, we can use any block as long as it has a scratch register + // available. + return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; +} + +static bool windowsRequiresStackProbe(MachineFunction &MF, + unsigned StackSizeInBytes) { + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + if (!Subtarget.isTargetWindows()) + return false; + const Function &F = MF.getFunction(); + // TODO: When implementing stack protectors, take that into account + // for the probe threshold. + unsigned StackProbeSize = 4096; + if (F.hasFnAttribute("stack-probe-size")) + F.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackSizeInBytes >= StackProbeSize; +} + +bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( + MachineFunction &MF, unsigned StackBumpBytes) const { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + if (AFI->getLocalStackSize() == 0) + return false; + + // 512 is the maximum immediate for stp/ldp that will be used for + // callee-save save/restores + if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes)) + return false; + + if (MFI.hasVarSizedObjects()) + return false; + + if (RegInfo->needsStackRealignment(MF)) + return false; + + // This isn't strictly necessary, but it simplifies things a bit since the + // current RedZone handling code assumes the SP is adjusted by the + // callee-save save/restore code. 
+ if (canUseRedZone(MF)) + return false; + + return true; +} + +// Convert callee-save register save/restore instruction to do stack pointer +// decrement/increment to allocate/deallocate the callee-save stack area by +// converting store/load to use pre/post increment version. +static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { + unsigned NewOpc; + bool NewIsUnscaled = false; + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Unexpected callee-save save/restore opcode!"); + case AArch64::STPXi: + NewOpc = AArch64::STPXpre; + break; + case AArch64::STPDi: + NewOpc = AArch64::STPDpre; + break; + case AArch64::STRXui: + NewOpc = AArch64::STRXpre; + NewIsUnscaled = true; + break; + case AArch64::STRDui: + NewOpc = AArch64::STRDpre; + NewIsUnscaled = true; + break; + case AArch64::LDPXi: + NewOpc = AArch64::LDPXpost; + break; + case AArch64::LDPDi: + NewOpc = AArch64::LDPDpost; + break; + case AArch64::LDRXui: + NewOpc = AArch64::LDRXpost; + NewIsUnscaled = true; + break; + case AArch64::LDRDui: + NewOpc = AArch64::LDRDpost; + NewIsUnscaled = true; + break; + } + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + MIB.addReg(AArch64::SP, RegState::Define); + + // Copy all operands other than the immediate offset. + unsigned OpndIdx = 0; + for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; + ++OpndIdx) + MIB.add(MBBI->getOperand(OpndIdx)); + + assert(MBBI->getOperand(OpndIdx).getImm() == 0 && + "Unexpected immediate offset in first/last callee-save save/restore " + "instruction!"); + assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + assert(CSStackSizeInc % 8 == 0); + int64_t CSStackSizeIncImm = CSStackSizeInc; + if (!NewIsUnscaled) + CSStackSizeIncImm /= 8; + MIB.addImm(CSStackSizeIncImm); + + MIB.setMIFlags(MBBI->getFlags()); + MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end()); + + return std::prev(MBB.erase(MBBI)); +} + +// Fixup callee-save register save/restore instructions to take into account +// combined SP bump by adding the local stack size to the stack offsets. +static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, + unsigned LocalStackSize) { + unsigned Opc = MI.getOpcode(); + (void)Opc; + assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || + Opc == AArch64::STRXui || Opc == AArch64::STRDui || + Opc == AArch64::LDPXi || Opc == AArch64::LDPDi || + Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) && + "Unexpected callee-save save/restore opcode!"); + + unsigned OffsetIdx = MI.getNumExplicitOperands() - 1; + assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); + // All generated opcodes have scaled offsets. 
+ assert(LocalStackSize % 8 == 0); + OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8); } void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *Fn = MF.getFunction(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry(); bool HasFP = hasFP(MF); // Debug location must be unknown since the first debug location is used @@ -306,50 +492,74 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; - int NumBytes = (int)MFI->getStackSize(); - if (!AFI->hasStackFrame()) { + int NumBytes = (int)MFI.getStackSize(); + if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - + if (!NumBytes) + return; // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { + if (canUseRedZone(MF)) + ++NumRedZoneFunctions; + else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( + unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - } else if (NumBytes) { - ++NumRedZoneFunctions; } - return; } - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) - FPOffset = getFPOffsetInPrologue(MBBI); + bool IsWin64 = + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); + unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - // Move past the saves of the callee-saved registers. - while (isCSSave(MBBI)) { - ++MBBI; - NumBytes -= 16; + auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; + // All of the remaining stack allocations are for locals. 
+ AFI->setLocalStackSize(NumBytes - PrologueSaveSize); + + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + if (CombineSPBump) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + NumBytes = 0; + } else if (PrologueSaveSize != 0) { + MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, + -PrologueSaveSize); + NumBytes -= PrologueSaveSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // and pre-inc if we decided to combine the callee-save and local stack + // pointer bump above. + MachineBasicBlock::iterator End = MBB.end(); + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize()); + ++MBBI; + } if (HasFP) { + // Only set up FP if we actually need to. Frame pointer is fp = + // sp - fixedobject - 16. + int FPOffset = AFI->getCalleeSavedStackSize() - 16; + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); + // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". @@ -358,47 +568,84 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); } - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); + if (windowsRequiresStackProbe(MF, NumBytes)) { + uint32_t NumWords = NumBytes >> 4; - // Allocate space for the rest of the frame. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup); - const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); - unsigned scratchSPReg = AArch64::SP; - if (NumBytes && NeedsRealignment) { - // Use the first callee-saved register as a scratch register. - scratchSPReg = AArch64::X9; + switch (MF.getTarget().getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Kernel: + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addExternalSymbol("__chkstk") + .addReg(AArch64::X15, RegState::Implicit) + .setMIFlags(MachineInstr::FrameSetup); + break; + case CodeModel::Large: + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) + .addReg(AArch64::X16, RegState::Define) + .addExternalSymbol("__chkstk") + .addExternalSymbol("__chkstk") + .setMIFlags(MachineInstr::FrameSetup); + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) + .addReg(AArch64::X16, RegState::Kill) + .addReg(AArch64::X15, RegState::Implicit | RegState::Define) + .setMIFlags(MachineInstr::FrameSetup); + break; + } + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) + .addReg(AArch64::SP, RegState::Kill) + .addReg(AArch64::X15, RegState::Kill) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) + .setMIFlags(MachineInstr::FrameSetup); + NumBytes = 0; } - // If we're a leaf function, try using the red zone. - if (NumBytes && !canUseRedZone(MF)) - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. 
+ if (NumBytes) { + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + unsigned scratchSPReg = AArch64::SP; + + if (NeedsRealignment) { + scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + assert(scratchSPReg != AArch64::NoRegister); + } + + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); - if (NumBytes && NeedsRealignment) { - const unsigned NrBitsToZero = countTrailingZeros(Alignment); - assert(NrBitsToZero > 1); - assert(scratchSPReg != AArch64::SP); - - // SUB X9, SP, NumBytes - // -- X9 is temporary register, so shouldn't contain any live data here, - // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - // The logical immediates have a non-trivial encoding. The following - // formula computes the encoded immediate with all ones but - // NrBitsToZero zero bits as least significant bits. - uint32_t andMaskEncoded = - (1 <<12) // = N - | ((64-NrBitsToZero) << 6) // immr - | ((64-NrBitsToZero-1) << 0) // imms - ; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsRealignment) { + const unsigned Alignment = MFI.getMaxAlignment(); + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(NrBitsToZero > 1); + assert(scratchSPReg != AArch64::SP); + + // SUB X9, SP, NumBytes + // -- X9 is temporary register, so shouldn't contain any live data here, + // -- free to use. This is already produced by emitFrameOffset above. + // AND SP, X9, 0b11111...0000 + // The logical immediates have a non-trivial encoding. The following + // formula computes the encoded immediate with all ones but + // NrBitsToZero zero bits as least significant bits. + uint32_t andMaskEncoded = (1 << 12) // = N + | ((64 - NrBitsToZero) << 6) // immr + | ((64 - NrBitsToZero - 1) << 0); // imms + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + AFI->setStackRealigned(true); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -486,73 +733,31 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (HasFP) { // Define the current CFA rule to use the provided FP. 
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, Reg, 2 * StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). + emitCalleeSavedFrameMoves(MBB, MBBI); } } -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -/// Checks whether the given instruction restores callee save registers -/// and if so returns how many. -static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - switch (MI.getOpcode()) { - case AArch64::LDPXpost: - case AArch64::LDPDpost: - RtIdx = 1; - // FALLTHROUGH - case AArch64::LDPXi: - case AArch64::LDPDi: - if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || - MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) - return 0; - return 2; - } - return 0; -} - void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; @@ -562,12 +767,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri; } - int NumBytes = MFI->getStackSize(); + int NumBytes = MFI.getStackSize(); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; // Initial and residual are named for consistency with the prologue. 
Note that @@ -599,7 +804,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 8) | | | + // | (CalleeSavedStackSize)| | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -614,41 +819,79 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - unsigned NumRestores = 0; + bool IsWin64 = + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); + unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + + auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + + if (!CombineSPBump && PrologueSaveSize != 0) + convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize); + // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; - unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); - NumRestores += Restores; - if (Restores == 0) { + if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; - } + } else if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize()); + } + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + NumBytes + ArgumentPopSize, TII, + MachineInstr::FrameDestroy); + return; } - NumBytes -= NumRestores * 8; + + NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { + bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, - TII); - return; + // stack pointer (but we may need to pop stack args for fastcc). + if (RedZone && ArgumentPopSize == 0) + return; + + bool NoCalleeSaveRestore = PrologueSaveSize == 0; + int StackRestoreBytes = RedZone ? 0 : NumBytes; + if (NoCalleeSaveRestore) + StackRestoreBytes += ArgumentPopSize; + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackRestoreBytes, TII, MachineInstr::FrameDestroy); + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || ArgumentPopSize == 0) + return; + NumBytes = 0; } // Restore the original stack pointer. // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. 
- if (NumBytes || MFI->hasVarSizedObjects()) + if (MFI.hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); + -AFI->getCalleeSavedStackSize() + 16, TII, + MachineInstr::FrameDestroy); + else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, + MachineInstr::FrameDestroy); + + // This must be placed after the callee-save restore code because that code + // assumes the SP is at the same location as it was after the callee-save save + // code in the prologue. + if (ArgumentPopSize) + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + ArgumentPopSize, TII, MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -664,13 +907,17 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - int FPOffset = MFI->getObjectOffset(FI) + 16; - int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); - bool isFixed = MFI->isFixedObjectIndex(FI); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + bool IsWin64 = + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); + unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16; + int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); + bool isFixed = MFI.isFixedObjectIndex(FI); // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -695,7 +942,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, // using the FP regardless, though, as the SP offset is unknown // and we don't have a base pointer available. If an offset is // available via the FP and the SP, use whichever is closest. - if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || + if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 || (FPOffset >= -256 && Offset > -FPOffset)) UseFP = true; } @@ -726,156 +973,234 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != AArch64::LR) - return getKillRegState(true); + // Do not set a kill flag on values that are also marked as live-in. This + // happens with the @llvm-returnaddress intrinsic and with arguments passed in + // callee saved registers. + // Omitting the kill flags is conservatively correct even if the live-in + // is not used after all. + bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); + return getKillRegState(!IsLiveIn); +} - // LR maybe referred to later by an @llvm.returnaddress intrinsic. 
- bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); +static bool produceCompactUnwindFrame(MachineFunction &MF) { + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + AttributeList Attrs = MF.getFunction().getAttributes(); + return Subtarget.isTargetMachO() && + !(Subtarget.getTargetLowering()->supportSwiftError() && + Attrs.hasAttrSomewhere(Attribute::SwiftError)); } -bool AArch64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); +namespace { + +struct RegPairInfo { + unsigned Reg1 = AArch64::NoRegister; + unsigned Reg2 = AArch64::NoRegister; + int FrameIdx; + int Offset; + bool IsGPR; + + RegPairInfo() = default; + + bool isPaired() const { return Reg2 != AArch64::NoRegister; } +}; + +} // end anonymous namespace + +static void computeCalleeSaveRegisterPairs( + MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) { + + if (CSI.empty()) + return; + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + CallingConv::ID CC = MF.getFunction().getCallingConv(); unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + (void)CC; + // MachO's compact unwind format relies on all registers being stored in + // pairs. + assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (Count & 1) == 0) && + "Odd number of callee-saved regs to spill!"); + int Offset = AFI->getCalleeSavedStackSize(); + + for (unsigned i = 0; i < Count; ++i) { + RegPairInfo RPI; + RPI.Reg1 = CSI[i].getReg(); + + assert(AArch64::GPR64RegClass.contains(RPI.Reg1) || + AArch64::FPR64RegClass.contains(RPI.Reg1)); + RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1); + + // Add the next reg to the pair if it is in the same register class. + if (i + 1 < Count) { + unsigned NextReg = CSI[i + 1].getReg(); + if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) || + (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg))) + RPI.Reg2 = NextReg; + } - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + assert((!RPI.isPaired() || + (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + + // MachO's compact unwind format relies on all registers being stored in + // adjacent register pairs. 
+ assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (RPI.isPaired() && + ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || + RPI.Reg1 + 1 == RPI.Reg2))) && + "Callee-save registers not saved as adjacent register pair!"); + + RPI.FrameIdx = CSI[i].getFrameIdx(); + + if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + Offset -= 16; + assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); + MFI.setObjectAlignment(RPI.FrameIdx, 16); + AFI->setCalleeSaveStackHasFreeSpace(true); + } else + Offset -= RPI.isPaired() ? 16 : 8; + assert(Offset % 8 == 0); + RPI.Offset = Offset / 8; + assert((RPI.Offset >= -64 && RPI.Offset <= 63) && + "Offset out of bounds for LDP/STP immediate"); + + RegPairs.push_back(RPI); + if (RPI.isPaired()) + ++i; + } +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + DebugLoc DL; + SmallVector<RegPairInfo, 8> RegPairs; + + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. + + // Issue sequence of spills for cs regs. The first spill may be converted + // to a pre-decrement store later by emitPrologue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPXpre; - else - StrOpc = AArch64::STPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPDpre; - else - StrOpc = AArch64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? 
-Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) - MIB.addReg(AArch64::SP, RegState::Define); + // Note: Similar rationale and sequence for restores in epilog. + if (RPI.IsGPR) + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + else + StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; + DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); + if (RPI.isPaired()) + dbgs() << ", " << printReg(Reg2, TRI); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); - MBB.addLiveIn(Reg1); - MBB.addLiveIn(Reg2); - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); + if (!MRI.isReserved(Reg1)) + MBB.addLiveIn(Reg1); + if (RPI.isPaired()) { + if (!MRI.isReserved(Reg2)) + MBB.addLiveIn(Reg2); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOStore, 8, 8)); + } + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit .setMIFlag(MachineInstr::FrameSetup); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOStore, 8, 8)); } return true; } bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned Count = CSI.size(); DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + SmallVector<RegPairInfo, 8> RegPairs; if (MI != MBB.end()) DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only - // the last load is sp-pi post-increment and de-allocates the stack: + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; + + // Issue sequence of restores for cs regs. The last restore may be converted + // to a post-increment load later by emitEpilogue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) + // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; + if (RPI.IsGPR) + LdrOpc = RPI.isPaired() ? 
AArch64::LDPXi : AArch64::LDRXui; + else + LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; + DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); + if (RPI.isPaired()) + dbgs() << ", " << printReg(Reg2, TRI); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPXpost; - else - LdrOpc = AArch64::LDPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPDpost; - else - LdrOpc = AArch64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) - MIB.addReg(AArch64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) + if (RPI.isPaired()) { + MIB.addReg(Reg2, getDefRegState(true)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOLoad, 8, 8)); + } + MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOLoad, 8, 8)); } return true; } @@ -885,107 +1210,93 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, RegScavenger *RS) const { // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - SmallVector<unsigned, 4> UnspilledCSGPRs; - SmallVector<unsigned, 4> UnspilledCSFPRs; + unsigned UnspilledCSGPR = AArch64::NoRegister; + unsigned UnspilledCSGPRPaired = AArch64::NoRegister; + + MachineFrameInfo &MFI = MF.getFrameInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + + unsigned BasePointerReg = RegInfo->hasBasePointer(MF) + ? 
RegInfo->getBaseRegister() + : (unsigned)AArch64::NoRegister; + + unsigned SpillEstimate = SavedRegs.count(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + unsigned PairedReg = CSRegs[i ^ 1]; + if (Reg == BasePointerReg) + SpillEstimate++; + if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) + SpillEstimate++; + } + + if (MFI.hasReturnProtectorRegister()) { + SavedRegs.set(MFI.getReturnProtectorRegister()); + SpillEstimate++; + } + + SpillEstimate += 2; // Conservatively include FP+LR in the estimate + unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate; // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF)) { + if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) { SavedRegs.set(AArch64::FP); SavedRegs.set(AArch64::LR); } - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. - if (RegInfo->hasBasePointer(MF)) - SavedRegs.set(RegInfo->getBaseRegister()); - - if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - SavedRegs.set(AArch64::X9); + unsigned ExtraCSSpill = 0; + // Figure out which callee-saved registers to save/restore. + for (unsigned i = 0; CSRegs[i]; ++i) { + const unsigned Reg = CSRegs[i]; - // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; - bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + // Add the base pointer register to SavedRegs if it is callee-save. + if (Reg == BasePointerReg) + SavedRegs.set(Reg); - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((AArch64::GPR64RegClass.contains(OddReg) && - AArch64::GPR64RegClass.contains(EvenReg)) ^ - (AArch64::FPR64RegClass.contains(OddReg) && - AArch64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = SavedRegs.test(OddReg); - const bool EvenRegUsed = SavedRegs.test(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. - if (!OddRegUsed && !EvenRegUsed) { - if (AArch64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); + bool RegUsed = SavedRegs.test(Reg); + unsigned PairedReg = CSRegs[i ^ 1]; + if (!RegUsed) { + if (AArch64::GPR64RegClass.contains(Reg) && + !RegInfo->isReservedReg(MF, Reg)) { + UnspilledCSGPR = Reg; + UnspilledCSGPRPaired = PairedReg; } continue; } - unsigned Reg = AArch64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - SavedRegs.set(Reg); + // MachO's compact unwind format relies on all registers being stored in + // pairs. + // FIXME: the usual format is actually better if unwinding isn't needed. 
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) { + SavedRegs.set(PairedReg); + if (AArch64::GPR64RegClass.contains(PairedReg) && + !RegInfo->isReservedReg(MF, PairedReg)) + ExtraCSSpill = PairedReg; } + } - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (AArch64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + for (unsigned Reg : SavedRegs.set_bits()) + dbgs() << ' ' << printReg(Reg, RegInfo); + dbgs() << "\n";); - CanEliminateFrame = false; - } + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumRegsSpilled = SavedRegs.count(); + bool CanEliminateFrame = NumRegsSpilled == 0; - // FIXME: Set BigStack if any stack slot references may be out of range. - // For now, just conservatively guestimate based on unscaled indexing - // range. We'll end up allocating an unnecessary spill slot a lot, but - // realistically that's not a big deal at this stage of the game. // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. - MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = - MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); - bool BigStack = (CFSize >= 256); + unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); + bool BigStack = (CFSize > EstimatedStackSizeLimit); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -995,30 +1306,45 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // register scavenging. If we already spilled an extra callee-saved register // above to keep the number of spills even, we don't need to do anything else // here. - if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) + if (BigStack) { + if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { + DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo) << " to get a scratch register.\n"); - SavedRegs.set(Reg); - ExtraCSSpill = true; - ++Count; + SavedRegs.set(UnspilledCSGPR); + // MachO's compact unwind format relies on all registers being stored in + // pairs, so if we need to spill one extra for BigStack, then we need to + // store the pair. 
+ if (produceCompactUnwindFrame(MF)) + SavedRegs.set(UnspilledCSGPRPaired); + ExtraCSSpill = UnspilledCSGPRPaired; + NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create // an emergency spill slot. - if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; - int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); + if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass &RC = AArch64::GPR64RegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + int FI = MFI.CreateStackObject(Size, Align, false); RS->addScavengingFrameIndex(FI); DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); } } + + // Round up to register pair alignment to avoid additional SP adjustment + // instructions. + AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); +} + +bool AArch64FrameLowering::enableStackSlotScavenging( + const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return AFI->hasCalleeSaveStackFreeSpace(); +} + +const ReturnProtectorLowering *AArch64FrameLowering::getReturnProtector() const { + return &RPL; } diff --git a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 7d8354c3878..fcbbc1434f1 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -14,29 +14,34 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "AArch64ReturnProtectorLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class AArch64FrameLowering : public TargetFrameLowering { public: + + const AArch64ReturnProtectorLowering RPL; + explicit AArch64FrameLowering() : TargetFrameLowering(StackGrowsDown, 16, 0, 16, - true /*StackRealignable*/) {} + true /*StackRealignable*/), RPL() {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + const ReturnProtectorLowering *getReturnProtector() const override; + bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, @@ -51,7 +56,7 @@ public: bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const override; /// \brief Can this function use the red zone for local allocations. 
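As a minimal sketch of the offset bookkeeping performed by computeCalleeSaveRegisterPairs() above (standalone illustrative C++, not taken from the LLVM tree; PairSketch and assignOffsets are made-up names): offsets are handed out from the top of the callee-save area downward, 16 bytes per register pair and 8 bytes per unpaired register, and are recorded in 8-byte units so they fit the scaled signed 7-bit immediate of LDP/STP.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    struct PairSketch {
      bool Paired;   // two registers share one STP/LDP
      int ScaledOff; // offset in 8-byte units, as encoded in the instruction
    };

    // CalleeSaveSize is assumed to be pre-rounded to 16 bytes, mirroring
    // setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)) in the patch.
    static std::vector<PairSketch> assignOffsets(const std::vector<bool> &Paired,
                                                 int CalleeSaveSize) {
      std::vector<PairSketch> Out;
      int Offset = CalleeSaveSize;
      for (bool P : Paired) {
        Offset -= P ? 16 : 8;        // pairs take 16 bytes, singles take 8
        assert(Offset % 8 == 0);
        int Scaled = Offset / 8;     // LDP/STP encode the offset scaled by 8
        assert(Scaled >= -64 && Scaled <= 63 &&
               "Offset out of bounds for LDP/STP immediate");
        Out.push_back({P, Scaled});
      }
      return Out;
    }

    int main() {
      // Three pairs in a 48-byte area get byte offsets 32, 16 and 0, i.e. the
      // scaled immediates #4, #2 and #0 that appear as addImm(+4)/(+2)/(+0)
      // in the comments above.
      for (const PairSketch &P : assignOffsets({true, true, true}, 48))
        std::printf("paired=%d offset=#%d\n", P.Paired, P.ScaledOff);
      return 0;
    }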
@@ -67,6 +72,12 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + + bool enableStackSlotScavenging(const MachineFunction &MF) const override; + +private: + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d02bc9ff394..276d664ac61 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -18,23 +18,46 @@ def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; +def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, + AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON", "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasDotProd : Predicate<"Subtarget->hasDotProd()">, + AssemblerPredicate<"FeatureDotProd", "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasLSE : Predicate<"Subtarget->hasLSE()">, + AssemblerPredicate<"FeatureLSE", "lse">; +def HasRAS : Predicate<"Subtarget->hasRAS()">, + AssemblerPredicate<"FeatureRAS", "ras">; +def HasRDM : Predicate<"Subtarget->hasRDM()">, + AssemblerPredicate<"FeatureRDM", "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, + AssemblerPredicate<"FeatureFuseAES", + "fuse-aes">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; +def HasRCPC : Predicate<"Subtarget->hasRCPC()">, + AssemblerPredicate<"FeatureRCPC", "rcpc">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; -def IsCyclone : Predicate<"Subtarget->isCyclone()">; +def UseAlternateSExtLoadCVTF32 + : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; + +def UseNegativeImmediates + : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", + "NegativeImmediates">; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. 
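Before the AArch64InstrInfo.td hunks, one more illustrative sketch (plain C++, not from the tree; alignTo here is a local stand-in for llvm::alignTo) of why determineCalleeSaves() rounds the callee-save area to 16 bytes and how that ties into the new enableStackSlotScavenging() hook: spilling an odd number of 8-byte registers leaves 8 bytes of padding inside the area, which the patch records via setCalleeSaveStackHasFreeSpace() so that gap can serve as a scavenging slot.

    #include <cstdint>
    #include <cstdio>

    // Local stand-in for llvm::alignTo(Value, Align).
    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      for (unsigned NumRegsSpilled : {2, 3, 4, 5}) {
        uint64_t Size = alignTo(8 * NumRegsSpilled, 16); // as in the patch
        // Roughly corresponds to setCalleeSaveStackHasFreeSpace(): an odd
        // register count leaves one unused 8-byte slot inside the area.
        bool HasFreeSlot = Size != 8ull * NumRegsSpilled;
        std::printf("%u CSRs -> %llu-byte area, free 8-byte slot: %s\n",
                    NumRegsSpilled, (unsigned long long)Size,
                    HasFreeSlot ? "yes" : "no");
      }
      return 0;
    }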
@@ -144,7 +167,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, [SDNPHasChain, SDNPOutGlue]>; def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDCallSeqEnd<[ SDTCisVT<0, i32>, @@ -283,6 +307,11 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; +def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>; +def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; +def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -295,15 +324,18 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. -// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; -def ForCodeSize : Predicate<"ForCodeSize">; -def NotForCodeSize : Predicate<"!ForCodeSize">; +// We could compute these on a per-module basis but doing so requires accessing +// the Function object through the <Target>Subtarget and objections were raised +// to that (see post-commit review comments for r301750). +let RecomputePerFunction = 1 in { + def ForCodeSize : Predicate<"MF->getFunction().optForSize()">; + def NotForCodeSize : Predicate<"!MF->getFunction().optForSize()">; + // Avoid generating STRQro if it is slow, unless we're optimizing for code size. + def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().optForSize()">; +} include "AArch64InstrFormats.td" +include "SVEInstrFormats.td" //===----------------------------------------------------------------------===// @@ -312,10 +344,14 @@ include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>; +// We set Sched to empty list because we expect these instructions to simply get +// removed in most cases. 
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_start timm:$amt1, timm:$amt2)]>, + Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { @@ -363,6 +399,12 @@ def MOVaddrEXT } // isReMaterializable, isCodeGenOnly +//===----------------------------------------------------------------------===// +// Pseudo instruction used by retguard +let isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in { + def RETGUARD_JMP_TRAP: Pseudo<(outs), (ins GPR64:$reg), []>; +} + def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), (LOADgot tglobaltlsaddr:$addr)>; @@ -383,6 +425,7 @@ def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; // v8.2a Statistical Profiling extension def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; @@ -404,6 +447,108 @@ def ISB : CRmSystemI<barrier_op, 0b110, "isb", [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>; } +// ARMv8.2 Dot Product +let Predicates = [HasDotProd] in { +def UDOT2S : BaseSIMDThreeSameVectorDot<0, 1, "udot", ".2s", ".8b">; +def SDOT2S : BaseSIMDThreeSameVectorDot<0, 0, "sdot", ".2s", ".8b">; +def UDOT4S : BaseSIMDThreeSameVectorDot<1, 1, "udot", ".4s", ".16b">; +def SDOT4S : BaseSIMDThreeSameVectorDot<1, 0, "sdot", ".4s", ".16b">; +def UDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 1, "udot", ".2s", ".8b", ".4b">; +def SDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 0, "sdot", ".2s", ".8b", ".4b">; +def UDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 1, "udot", ".4s", ".16b", ".4b">; +def SDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 0, "sdot", ".4s", ".16b", ".4b">; +} + +let Predicates = [HasRCPC] in { + // v8.3 Release Consistent Processor Consistent support, optional in v8.2. + def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>; + def LDAPRH : RCPCLoad<0b01, "ldaprh", GPR32>; + def LDAPRW : RCPCLoad<0b10, "ldapr", GPR32>; + def LDAPRX : RCPCLoad<0b11, "ldapr", GPR64>; +} + +// v8.3a complex add and multiply-accumulate. No predicate here, that is done +// inside the multiclass as the FP16 versions need different predicates. 
+defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop, + "fcmla", null_frag>; +defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd, + "fcadd", null_frag>; +defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla", + null_frag>; + +let Predicates = [HasV8_3a] in { + // v8.3a Pointer Authentication + let Uses = [LR], Defs = [LR] in { + def PACIAZ : SystemNoOperands<0b000, "paciaz">; + def PACIBZ : SystemNoOperands<0b010, "pacibz">; + def AUTIAZ : SystemNoOperands<0b100, "autiaz">; + def AUTIBZ : SystemNoOperands<0b110, "autibz">; + } + let Uses = [LR, SP], Defs = [LR] in { + def PACIASP : SystemNoOperands<0b001, "paciasp">; + def PACIBSP : SystemNoOperands<0b011, "pacibsp">; + def AUTIASP : SystemNoOperands<0b101, "autiasp">; + def AUTIBSP : SystemNoOperands<0b111, "autibsp">; + } + let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in { + def PACIA1716 : SystemNoOperands<0b000, "pacia1716">; + def PACIB1716 : SystemNoOperands<0b010, "pacib1716">; + def AUTIA1716 : SystemNoOperands<0b100, "autia1716">; + def AUTIB1716 : SystemNoOperands<0b110, "autib1716">; + } + + let Uses = [LR], Defs = [LR], CRm = 0b0000 in { + def XPACLRI : SystemNoOperands<0b111, "xpaclri">; + } + + multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> { + def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>; + def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>; + def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da")>; + def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db")>; + def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza")>; + def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza")>; + def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb")>; + def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb")>; + } + + defm PAC : SignAuth<0b000, 0b010, "pac">; + defm AUT : SignAuth<0b001, 0b011, "aut">; + + def XPACI : SignAuthZero<0b100, 0b00, "xpaci">; + def XPACD : SignAuthZero<0b100, 0b01, "xpacd">; + def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>; + + // Combined Instructions + def BRAA : AuthBranchTwoOperands<0, 0, "braa">; + def BRAB : AuthBranchTwoOperands<0, 1, "brab">; + def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; + def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + + def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; + def BRABZ : AuthOneOperand<0b000, 1, "brabz">; + def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; + def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + + let isReturn = 1 in { + def RETAA : AuthReturn<0b010, 0, "retaa">; + def RETAB : AuthReturn<0b010, 1, "retab">; + def ERETAA : AuthReturn<0b100, 0, "eretaa">; + def ERETAB : AuthReturn<0b100, 1, "eretab">; + } + + defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>; + defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>; + + // v8.3a floating point conversion for javascript + let Predicates = [HasV8_3a, HasFPARMv8] in + def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, + "fjcvtzs", []> { + let Inst{31} = 0; + } + +} // HasV8_3A + def : InstAlias<"clrex", (CLREX 0xf)>; def : InstAlias<"isb", (ISB 0xf)>; @@ -414,7 +559,8 @@ def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. -def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), + [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; // The cycle counter PMC register is PMCCNTR_EL0. 
let Predicates = [HasPerfMon] in @@ -439,8 +585,8 @@ let PostEncoderMethod = "fixMOVZ" in defm MOVZ : MoveImmediate<0b10, "movz">; // First group of aliases covers an implicit "lsl #0". -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>; def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; @@ -457,10 +603,10 @@ def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>; def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; @@ -468,8 +614,8 @@ def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>; // Final group of aliases covers true "mov $Rd, $imm" cases. multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR, @@ -528,6 +674,12 @@ def i64imm_32bit : ImmLeaf<i64, [{ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); }]>; +def s64imm_32bit : ImmLeaf<i64, [{ + int64_t Imm64 = static_cast<int64_t>(Imm); + return Imm64 >= std::numeric_limits<int32_t>::min() && + Imm64 <= std::numeric_limits<int32_t>::max(); +}]>; + def trunc_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32); }]>; @@ -557,31 +709,31 @@ def : Pat<(f64 fpimm:$in), // sequences. 
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0), + tglobaladdr:$g1, 16), + tglobaladdr:$g2, 32), + tglobaladdr:$g3, 48)>; def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0), + tblockaddress:$g1, 16), + tblockaddress:$g2, 32), + tblockaddress:$g3, 48)>; def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0), + tconstpool:$g1, 16), + tconstpool:$g2, 32), + tconstpool:$g3, 48)>; def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, tjumptable:$g1, tjumptable:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), - tjumptable:$g2, 32), - tjumptable:$g1, 16), - tjumptable:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0), + tjumptable:$g1, 16), + tjumptable:$g2, 32), + tjumptable:$g3, 48)>; //===----------------------------------------------------------------------===// @@ -679,10 +831,11 @@ def : InstAlias<"negs $dst, $src$shift", // Unsigned/Signed divide defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; -} + +def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>; // Variable shift defm ASRV : Shift<0b10, "asr", sra>; @@ -700,7 +853,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>; def : ShiftAlias<"rorv", RORVXr, GPR64>; // Multiply-add -let AddedComplexity = 7 in { +let AddedComplexity = 5 in { defm MADD : MulAccum<0, "madd", add>; defm MSUB : MulAccum<1, "msub", sub>; @@ -717,7 +870,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 +} // AddedComplexity = 5 let AddedComplexity = 5 in { def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; @@ -734,6 +887,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), 
XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)), + GPR64:$Ra)), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; + +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32), + (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; } // AddedComplexity = 5 def : MulAccumWAlias<"mul", MADDWrrr>; @@ -899,10 +1086,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; defm CLS : OneOperandData<0b101, "cls">; defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; - -def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; -def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; +defm RBIT : OneOperandData<0b000, "rbit", bitreverse>; def REV16Wr : OneWRegData<0b001, "rev16", UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; @@ -1085,10 +1269,26 @@ def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), (CSINCWr WZR, WZR, (i32 imm:$cc))>; def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), (CSINCXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV), + (CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV), + (CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>; +def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>; def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), (CSINVWr WZR, WZR, (i32 imm:$cc))>; def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), (CSINVXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>; +def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>; // The inverse of the condition code from the alias instruction is what is used // in the aliased instruction. 
The parser all ready inverts the condition code @@ -1158,7 +1358,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; // Create a separate pseudo-instruction for codegen to use so that we don't // flag lr as used in every function. It'll be restored before the RET by the // epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>, + Sched<[WriteBrReg]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; @@ -1168,7 +1369,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { // R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction // (which in the usual case is a BLR). let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { let AsmString = ".tlsdesccall $sym"; } @@ -1178,7 +1379,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, isCodeGenOnly = 1 in def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), - [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>; + [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>, + Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>; def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), (TLSDESC_CALLSEQ texternalsym:$sym)>; @@ -1954,6 +2156,17 @@ defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; +let Predicates = [UseSTRQro], AddedComplexity = 10 in { + def : Pat<(store (f128 FPR128:$Rt), + (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend128:$extend)), + (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>; + def : Pat<(store (f128 FPR128:$Rt), + (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend128:$extend)), + (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Wextend128:$extend)>; +} + multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop, Instruction STRW, Instruction STRX> { @@ -2001,7 +2214,7 @@ defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>; defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>; // Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { +let Predicates = [IsLE, UseSTRQro] in { // We must use ST1 to store vectors in big-endian. 
defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>; defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>; @@ -2042,11 +2255,11 @@ let AddedComplexity = 19 in { //--- // (unsigned immediate) -defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", - [(store GPR64:$Rt, +defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str", + [(store GPR64z:$Rt, (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; -defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", - [(store GPR32:$Rt, +defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str", + [(store GPR32z:$Rt, (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", [(store FPR8:$Rt, @@ -2062,12 +2275,12 @@ defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; -defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", - [(truncstorei16 GPR32:$Rt, +defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh", + [(truncstorei16 GPR32z:$Rt, (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; -defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", - [(truncstorei8 GPR32:$Rt, +defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb", + [(truncstorei8 GPR32z:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; @@ -2444,13 +2657,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; -defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; + +multiclass FPToIntegerIntPats<Intrinsic round, string INST> { + def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>; + def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>; + def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>; + def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>; + def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>; + def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>; + + def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; } +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">; +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; + multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { def : Pat<(i32 (to_int (round f32:$Rn))), 
(!cast<Instruction>(INST # UWSr) f32:$Rn)>; @@ -2468,8 +2700,8 @@ defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">; defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">; defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">; defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">; -defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">; -defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">; +defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">; //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. @@ -2485,14 +2717,19 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable -let isReMaterializable = 1, isCodeGenOnly = 1 in { +let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { +def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>, + Sched<[WriteF]>; def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; } +// Similarly add aliases +def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>, + Requires<[HasFullFP16]>; +def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>; +def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>; //===----------------------------------------------------------------------===// // Floating point conversion instruction. @@ -2507,7 +2744,7 @@ defm FCVT : FPConversion<"fcvt">; defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; defm FMOV : SingleOperandFPData<0b0000, "fmov">; defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; -defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>; defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; @@ -2617,6 +2854,7 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), (i32 imm:$cond), NZCV))]> { let Uses = [NZCV]; let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; } @@ -2635,60 +2873,36 @@ defm FMOV : FPMoveImmediate<"fmov">; defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", int_aarch64_neon_uabd>; // Match UABDL in log2-shuffle patterns. 
+def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), (v8i16 (add (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), (zext (extract_high_v16i8 V128:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), - (v4i32 (add (sub (zext (v4i16 V64:$opA)), - (zext (v4i16 V64:$opB))), - (AArch64vashr v4i32:$src, (i32 31))))), +def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), - (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), - (zext (extract_high_v8i16 V128:$opB))), - (AArch64vashr v4i32:$src, (i32 31))))), +def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))))), (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; -def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), - (v2i64 (add (sub (zext (v2i32 V64:$opA)), - (zext (v2i32 V64:$opB))), - (AArch64vashr v2i64:$src, (i32 63))))), +def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))))), (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), - (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), - (zext (extract_high_v4i32 V128:$opB))), - (AArch64vashr v2i64:$src, (i32 63))))), +def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))))), (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; -defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; -def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), - (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), - (ABSv8i8 V64:$src)>; -def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))), - (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))), - (ABSv4i16 V64:$src)>; -def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))), - (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))), - (ABSv2i32 V64:$src)>; -def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))), - (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))), - (ABSv16i8 V128:$src)>; -def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))), - (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))), - (ABSv8i16 V128:$src)>; -def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))), - (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))), - (ABSv4i32 V128:$src)>; -def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))), - (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))), - (ABSv2i64 V128:$src)>; - +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; @@ -2712,13 +2926,13 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), (i64 4)))), 
(FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), +def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn), (i64 2))))), (FCVTLv4i32 V128:$Rn)>; -def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn), +def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn), (i64 4))))), (FCVTLv8i16 V128:$Rn)>; @@ -2732,9 +2946,9 @@ def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), def : Pat<(concat_vectors V64:$Rd, (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; -def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), +def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; @@ -2742,15 +2956,22 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_aarch64_neon_fcvtzu>; -} + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>; + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; + defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>; defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; @@ -3192,7 +3413,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl> defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; 
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; -let Predicates = [HasV8_1a] in { +let Predicates = [HasRDM] in { defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; def : Pat<(i32 (int_aarch64_neon_sqadd @@ -3253,7 +3474,7 @@ def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), // Advanced SIMD two scalar instructions. //===----------------------------------------------------------------------===// -defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; +defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>; defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; @@ -3318,6 +3539,30 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))), + (FRECPEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))), + (FRECPEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))), + (FRECPEv2f64 FPR128:$Rn)>; + +def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))), + (FRECPS32 FPR32:$Rn, FPR32:$Rm)>; +def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FRECPSv2f32 V64:$Rn, V64:$Rm)>; +def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))), + (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>; +def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))), + (FRECPS64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), + (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>; + def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), @@ -3330,6 +3575,30 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))), + (FRSQRTEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))), + (FRSQRTEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))), + (FRSQRTEv2f64 FPR128:$Rn)>; + +def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))), + (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>; +def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>; +def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))), + (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>; +def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))), + (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), + (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>; + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. 
// Here are the patterns for 8 and 16-bits to float. @@ -4254,20 +4523,20 @@ def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; // AdvSIMD FMOV def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, @@ -4319,18 +4588,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. -// Complexity is added to break a tie with a plain MOVI. 
-let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; @@ -4845,7 +5102,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST> 0), dsub)), 0), - ssub)))>, Requires<[NotForCodeSize, IsCyclone]>; + ssub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -4898,8 +5156,9 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)), 0), - dsub)))>, Requires<[NotForCodeSize, IsCyclone]>; - + dsub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; + def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext), @@ -5177,6 +5436,31 @@ def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>; def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>; def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>; +// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required +// for AES fusion on some CPUs. +let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { +def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">, + Sched<[WriteV]>; +def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">, + Sched<[WriteV]>; +} + +// Only use constrained versions of AES(I)MC instructions if they are paired with +// AESE/AESD. +def : Pat<(v16i8 (int_aarch64_crypto_aesmc + (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1), + (v16i8 V128:$src2))))), + (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1), + (v16i8 V128:$src2)))))>, + Requires<[HasFuseAES]>; + +def : Pat<(v16i8 (int_aarch64_crypto_aesimc + (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1), + (v16i8 V128:$src2))))), + (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1), + (v16i8 V128:$src2)))))>, + Requires<[HasFuseAES]>; + def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>; def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>; def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>; @@ -5194,15 +5478,8 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0 //---------------------------------------------------------------------------- // FIXME: Like for X86, these should go in their own separate .td file. -// Any instruction that defines a 32-bit result leaves the high half of the -// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. -// FIXME: X86 also checks for CMOV here. Do we need something similar? 
def def32 : PatLeaf<(i32 GPR32:$src), [{ - return N->getOpcode() != ISD::TRUNCATE && - N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && - N->getOpcode() != ISD::CopyFromReg; + return isDef32(*N); }]>; // In the case of a 32-bit def that is known to implicitly zero-extend, @@ -5982,7 +6259,7 @@ def : NTStore64Pat<v8i8>; def : Pat<(nontemporalstore GPR64:$Rt, (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), - (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32), GPR64sp:$Rn, simm7s4:$offset)>; } // AddedComplexity=10 } // Predicates = [IsLE] @@ -5990,8 +6267,10 @@ def : Pat<(nontemporalstore GPR64:$Rt, // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), @@ -6002,3 +6281,4 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; include "AArch64InstrAtomics.td" +include "AArch64SVEInstrInfo.td" diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp new file mode 100644 index 00000000000..9d4bdd9591b --- /dev/null +++ b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp @@ -0,0 +1,123 @@ +//===-- AArch64ReturnProtectorLowering.cpp --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of ReturnProtectorLowering +// class. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64ReturnProtectorLowering.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetOptions.h" +#include <cstdlib> + +using namespace llvm; + +void AArch64ReturnProtectorLowering::insertReturnProtectorPrologue( + MachineFunction &MF, MachineBasicBlock &MBB, GlobalVariable *cookie) const { + + MachineBasicBlock::instr_iterator MI = MBB.instr_begin(); + DebugLoc MBBDL = MBB.findDebugLoc(MI); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + unsigned REG = MF.getFrameInfo().getReturnProtectorRegister(); + + MBB.addLiveIn(REG); + BuildMI(MBB, MI, MBBDL, TII->get(AArch64::ADRP), REG) + .addGlobalAddress(cookie, 0, AArch64II::MO_PAGE); + BuildMI(MBB, MI, MBBDL, TII->get(AArch64::LDRXui), REG) + .addReg(REG) + .addGlobalAddress(cookie, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + BuildMI(MBB, MI, MBBDL, TII->get(AArch64::EORXrr), REG) + .addReg(REG) + .addReg(AArch64::LR); +} + +void AArch64ReturnProtectorLowering::insertReturnProtectorEpilogue( + MachineFunction &MF, MachineInstr &MI, GlobalVariable *cookie) const { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc MBBDL = MI.getDebugLoc(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + unsigned REG = MF.getFrameInfo().getReturnProtectorRegister(); + + MBB.addLiveIn(REG); + MBB.addLiveIn(AArch64::X9); + // REG holds the cookie we calculated in prologue. We use X9 as a + // scratch reg to pull the random data. XOR REG with LR should yield + // the random data again. Compare REG with X9 to check. 
+  BuildMI(MBB, MI, MBBDL, TII->get(AArch64::EORXrr), REG)
+      .addReg(REG)
+      .addReg(AArch64::LR);
+  BuildMI(MBB, MI, MBBDL, TII->get(AArch64::ADRP), AArch64::X9)
+      .addGlobalAddress(cookie, 0, AArch64II::MO_PAGE);
+  BuildMI(MBB, MI, MBBDL, TII->get(AArch64::LDRXui), AArch64::X9)
+      .addReg(AArch64::X9)
+      .addGlobalAddress(cookie, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  BuildMI(MBB, MI, MBBDL, TII->get(AArch64::SUBSXrr), REG)
+      .addReg(REG)
+      .addReg(AArch64::X9);
+  BuildMI(MBB, MI, MBBDL, TII->get(AArch64::RETGUARD_JMP_TRAP)).addReg(REG);
+}
+
+bool AArch64ReturnProtectorLowering::opcodeIsReturn(unsigned opcode) const {
+  switch (opcode) {
+  case AArch64::RET:
+  case AArch64::RET_ReallyLR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+void AArch64ReturnProtectorLowering::fillTempRegisters(
+    MachineFunction &MF, std::vector<unsigned> &TempRegs) const {
+
+  TempRegs.push_back(AArch64::X15);
+  TempRegs.push_back(AArch64::X14);
+  TempRegs.push_back(AArch64::X13);
+  TempRegs.push_back(AArch64::X12);
+  TempRegs.push_back(AArch64::X11);
+  TempRegs.push_back(AArch64::X10);
+}
+
+void AArch64ReturnProtectorLowering::saveReturnProtectorRegister(
+    const MachineFunction &MF, std::vector<CalleeSavedInfo> &CSI) const {
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (!MFI.getReturnProtectorNeeded())
+    return;
+
+  if (!MFI.hasReturnProtectorRegister())
+    llvm_unreachable("Saving unset return protector register");
+
+  // Put the temp reg after FP and LR to avoid layout issues
+  // with the D registers later.
+  bool added = false;
+  for (auto CSRI = CSI.begin(); CSRI != CSI.end(); CSRI++) {
+    if (CSRI->getReg() != AArch64::FP && CSRI->getReg() != AArch64::LR) {
+      CSI.insert(CSRI, CalleeSavedInfo(MFI.getReturnProtectorRegister()));
+      added = true;
+      break;
+    }
+  }
+  if (!added)
+    CSI.push_back(CalleeSavedInfo(MFI.getReturnProtectorRegister()));
+}
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h
new file mode 100644
index 00000000000..47feb002978
--- /dev/null
+++ b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h
@@ -0,0 +1,52 @@
+//===-- AArch64ReturnProtectorLowering.h - --------------------- -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of ReturnProtectorLowering
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64RETURNPROTECTORLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64RETURNPROTECTORLOWERING_H
+
+#include "llvm/CodeGen/ReturnProtectorLowering.h"
+
+namespace llvm {
+
+class AArch64ReturnProtectorLowering : public ReturnProtectorLowering {
+public:
+  /// insertReturnProtectorPrologue/Epilogue - insert return protector
+  /// instrumentation in prologue or epilogue.
+  virtual void
+  insertReturnProtectorPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
+                                GlobalVariable *cookie) const override;
+  virtual void
+  insertReturnProtectorEpilogue(MachineFunction &MF, MachineInstr &MI,
+                                GlobalVariable *cookie) const override;
+
+  /// opcodeIsReturn - Return true if the given opcode is a return
+  /// instruction needing return protection, false otherwise.
+ virtual bool opcodeIsReturn(unsigned opcode) const override; + + /// fillTempRegisters - Fill the list of available temp registers we can + /// use as a return protector register. + virtual void + fillTempRegisters(MachineFunction &MF, + std::vector<unsigned> &TempRegs) const override; + + /// saveReturnProtectorRegister - Allows the target to save the + /// CalculationRegister in the CalleeSavedInfo vector if needed. + virtual void + saveReturnProtectorRegister(const MachineFunction &MF, + std::vector<CalleeSavedInfo> &CSI) const override; +}; + +} // namespace llvm + +#endif diff --git a/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp b/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp index 91a656f4a48..5c3953bd478 100644 --- a/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp +++ b/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp @@ -3979,7 +3979,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, RetProtector = 1; } if (RetProtector && - (getToolChain().getArch() == llvm::Triple::x86_64) && + ((getToolChain().getArch() == llvm::Triple::x86_64) || + (getToolChain().getArch() == llvm::Triple::aarch64)) && !Args.hasArg(options::OPT_fno_stack_protector) && !Args.hasArg(options::OPT_pg)) { CmdArgs.push_back(Args.MakeArgString("-D_RET_PROTECTOR")); diff --git a/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile b/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile index 059c87c027d..c7333b25152 100644 --- a/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile +++ b/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile,v 1.6 2018/04/06 14:44:04 patrick Exp $ +# $OpenBSD: Makefile,v 1.7 2018/08/12 17:07:00 mortimer Exp $ LIB= LLVMAArch64CodeGen NOPIC= @@ -36,6 +36,7 @@ SRCS= AArch64A57FPLoadBalancing.cpp \ AArch64RedundantCopyElimination.cpp \ AArch64RegisterBankInfo.cpp \ AArch64RegisterInfo.cpp \ + AArch64ReturnProtectorLowering.cpp \ AArch64SIMDInstrOpt.cpp \ AArch64SelectionDAGInfo.cpp \ AArch64StorePairSuppress.cpp \ |
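A note on the return protector lowering added above: the prologue loads the per-function cookie (the GlobalVariable passed in as 'cookie') via ADRP/LDRXui and XORs it with LR into the chosen temp register; the epilogue XORs LR back out, reloads the cookie into the X9 scratch register, compares with SUBSXrr and traps through RETGUARD_JMP_TRAP on a mismatch. The following is a minimal C++ sketch of that invariant only, not the emitted machine code; the helper names are illustrative.

#include <cstdint>
#include <cstdlib>

// Model of the RETGUARD check: 'cookie' stands in for the per-function
// cookie global, 'lr' for the value of the link register.
static uint64_t retguard_prologue(uint64_t cookie, uint64_t lr) {
  // Prologue: temp register = cookie ^ LR.
  return cookie ^ lr;
}

static void retguard_epilogue(uint64_t guard, uint64_t cookie, uint64_t lr) {
  // Epilogue: XORing LR back out must reproduce the cookie; if the saved
  // return address was overwritten, the values differ and we trap.
  if ((guard ^ lr) != cookie)
    std::abort();
}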
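Separately, the nontemporal store hunk fixes the second word of the STNPWi expansion: UBFMXri with immediates 32,63 is LSR #32, so the pattern now stores the low and high 32-bit halves of the 64-bit register, whereas the old 0,31 immediates re-extracted the low half. A small C++ sketch of the intended split, using the hypothetical helper name splitForSTNPW:

#include <cstdint>
#include <utility>

// The two 32-bit values the corrected STNPWi pattern stores for a 64-bit Rt.
static std::pair<uint32_t, uint32_t> splitForSTNPW(uint64_t rt) {
  uint32_t lo = static_cast<uint32_t>(rt);       // EXTRACT_SUBREG Rt, sub_32
  uint32_t hi = static_cast<uint32_t>(rt >> 32); // UBFMXri Rt, 32, 63 (LSR #32)
  return {lo, hi};
}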