 gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp              |  266
 gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp           | 1134
 gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h             |   27
 gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td                |  602
 gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp |  123
 gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h   |   52
 gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp           |    3
 gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile               |    3
 8 files changed, 1590 insertions(+), 620 deletions(-)
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index ada995bad37..3fbd9892e30 100644
--- a/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/gnu/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
+//===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,34 +12,47 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64.h"
#include "AArch64MCInstLower.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
+#include "AArch64TargetObjectFile.h"
#include "InstPrinter/AArch64InstPrinter.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
-#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <memory>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -49,15 +62,14 @@ namespace {
class AArch64AsmPrinter : public AsmPrinter {
AArch64MCInstLower MCInstLowering;
StackMaps SM;
+ const AArch64Subtarget *STI;
public:
AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
- SM(*this), AArch64FI(nullptr) {}
+ SM(*this) {}
- const char *getPassName() const override {
- return "AArch64 Assembly Printer";
- }
+ StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
/// \brief Wrapper for MCInstLowering.lowerOperand() for the
/// tblgen'erated pseudo lowering.
@@ -69,6 +81,13 @@ public:
const MachineInstr &MI);
void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI);
+
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+ void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+ void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+
+ void EmitSled(const MachineInstr &MI, SledKind Kind);
+
/// \brief tblgen'erated driver function for lowering simple MI->MC
/// pseudo instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
@@ -83,11 +102,13 @@ public:
bool runOnMachineFunction(MachineFunction &F) override {
AArch64FI = F.getInfo<AArch64FunctionInfo>();
- return AsmPrinter::runOnMachineFunction(F);
+ STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
+ bool Result = AsmPrinter::runOnMachineFunction(F);
+ emitXRayTable();
+ return Result;
}
private:
- MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
bool printAsmRegInClass(const MachineOperand &MO,
@@ -107,18 +128,76 @@ private:
MCSymbol *GetCPISymbol(unsigned CPID) const override;
void EmitEndOfAsmFile(Module &M) override;
- AArch64FunctionInfo *AArch64FI;
+
+ AArch64FunctionInfo *AArch64FI = nullptr;
/// \brief Emit the LOHs contained in AArch64FI.
void EmitLOHs();
- typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+ /// Emit instruction to set float register to zero.
+ void EmitFMov0(const MachineInstr &MI);
+
+ using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
+
MInstToMCSymbol LOHInstToLabel;
};
-} // end of anonymous namespace
+} // end anonymous namespace
-//===----------------------------------------------------------------------===//
+void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
+{
+ EmitSled(MI, SledKind::FUNCTION_ENTER);
+}
+
+void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI)
+{
+ EmitSled(MI, SledKind::FUNCTION_EXIT);
+}
+
+void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
+{
+ EmitSled(MI, SledKind::TAIL_CALL);
+}
+
+void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
+{
+ static const int8_t NoopsInSledCount = 7;
+ // We want to emit the following pattern:
+ //
+ // .Lxray_sled_N:
+ // ALIGN
+ // B #32
+ // ; 7 NOP instructions (28 bytes)
+ // .tmpN
+ //
+ // We need the 28 bytes (7 instructions) because at runtime, we'd be patching
+ // over the full 32 bytes (8 instructions) with the following pattern:
+ //
+ // STP X0, X30, [SP, #-16]! ; push X0 and the link register to the stack
+ // LDR W0, #12 ; W0 := function ID
+ // LDR X16,#12 ; X16 := addr of __xray_FunctionEntry or __xray_FunctionExit
+ // BLR X16 ; call the tracing trampoline
+ // ;DATA: 32 bits of function ID
+ // ;DATA: lower 32 bits of the address of the trampoline
+ // ;DATA: higher 32 bits of the address of the trampoline
+ // LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
+ //
+ OutStreamer->EmitCodeAlignment(4);
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Emit "B #32" instruction, which jumps over the next 28 bytes.
+ // The operand has to be the number of 4-byte instructions to jump over,
+ // including the current instruction.
+ EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B).addImm(8));
+
+ for (int8_t I = 0; I < NoopsInSledCount; I++)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+
+ OutStreamer->EmitLabel(Target);
+ recordSled(CurSled, MI, Kind);
+}
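
[Editor's note: as a sanity check on the sled arithmetic, a minimal standalone C++ sketch; illustrative only, not part of the patch, with constants mirroring EmitSled above.]

    #include <cassert>

    int main() {
      const int NoopsInSledCount = 7; // NOPs emitted after the branch
      const int InstrBytes = 4;       // every AArch64 instruction is 4 bytes
      const int BranchImm = 8;        // operand given to MCInstBuilder(AArch64::B)
      // The branch immediate counts instructions, including the B itself,
      // so it must cover the branch plus the seven NOPs...
      assert(BranchImm == 1 + NoopsInSledCount);
      // ...which is exactly the 32-byte patchable region described above.
      assert(BranchImm * InstrBytes == 32);
      return 0;
    }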
void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
@@ -131,19 +210,29 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
SM.serializeToStackMapSection();
}
-}
-MachineLocation
-AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
- MachineLocation Location;
- assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
- // Frame address. Currently handles register +- offset only.
- if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
- Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
- else {
- DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+ if (TT.isOSBinFormatCOFF()) {
+ const auto &TLOF =
+ static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
+
+ std::string Flags;
+ raw_string_ostream OS(Flags);
+
+ for (const auto &Function : M)
+ TLOF.emitLinkerFlagsForGlobal(OS, &Function);
+ for (const auto &Global : M.globals())
+ TLOF.emitLinkerFlagsForGlobal(OS, &Global);
+ for (const auto &Alias : M.aliases())
+ TLOF.emitLinkerFlagsForGlobal(OS, &Alias);
+
+ OS.flush();
+
+ // Output collected flags
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
}
- return Location;
}
void AArch64AsmPrinter::EmitLOHs() {
@@ -171,7 +260,7 @@ MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
// Darwin uses a linker-private symbol name for constant-pools (to
// avoid addends on the relocation?), ELF has no such concept and
// uses a normal private symbol.
- if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
+ if (!getDataLayout().getLinkerPrivateGlobalPrefix().empty())
return OutContext.getOrCreateSymbol(
Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
Twine(getFunctionNumber()) + "_" + Twine(CPID));
@@ -238,8 +327,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
bool isVector, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
- const AArch64RegisterInfo *RI =
- MF->getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const TargetRegisterInfo *RI = STI->getRegisterInfo();
unsigned Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
@@ -265,6 +353,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
return true; // Unknown modifier.
+ case 'a': // Print 'a' modifier
+ PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+ return false;
case 'w': // Print W register
case 'x': // Print X register
if (MO.isReg())
@@ -333,7 +424,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0])
+ if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
return true; // Unknown modifier.
const MachineOperand &MO = MI->getOperand(OpNum);
@@ -364,7 +455,7 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
- unsigned NumNOPBytes = MI.getOperand(1).getImm();
+ unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
SM.recordStackMap(MI);
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
@@ -396,7 +487,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
PatchPointOpers Opers(&MI);
- int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+ int64_t CallTarget = Opers.getCallTarget().getImm();
unsigned EncodedBytes = 0;
if (CallTarget) {
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
@@ -404,16 +495,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 16;
// Materialize the jump address:
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF)
.addImm(32));
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF)
.addImm(16));
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF)
@@ -421,7 +512,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
}
// Emit padding.
- unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+ unsigned NumBytes = Opers.getNumPatchBytes();
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
assert((NumBytes - EncodedBytes) % 4 == 0 &&
@@ -430,6 +521,47 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
+void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
+ unsigned DestReg = MI.getOperand(0).getReg();
+ if (STI->hasZeroCycleZeroing() && !STI->hasZeroCycleZeroingFPWorkaround()) {
+ // Convert H/S/D register to corresponding Q register
+ if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
+ DestReg = AArch64::Q0 + (DestReg - AArch64::H0);
+ else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
+ DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
+ else {
+ assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
+ }
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVH0:
+ FMov.setOpcode(AArch64::FMOVWHr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
+ }
+ EmitToStreamer(*OutStreamer, FMov);
+ }
+}
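
[Editor's note: schematically, the two paths above emit different code for the same pseudo; taking FMOVD0 with destination d3 as an assumed example.]

    movi v3.2d, #0   ; zero-cycle-zeroing path: MOVIv2d_ns on the whole Q register
    fmov d3, xzr     ; fallback path: FMOVXDr from the integer zero register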
+
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"
@@ -451,6 +583,20 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
+ case AArch64::MOVIv2d_ns:
+ // If the target has <rdar://problem/16473581>, lower this
+ // instruction to movi.16b instead.
+ if (STI->hasZeroCycleZeroingFPWorkaround() &&
+ MI->getOperand(1).getImm() == 0) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(AArch64::MOVIv16b_ns);
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ break;
+
case AArch64::DBG_VALUE: {
if (isVerbose() && OutStreamer->hasRawTextSupport()) {
SmallString<128> TmpStr;
@@ -491,8 +637,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineOperand &MO_Sym = MI->getOperand(0);
MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
- MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
+ MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
MCInstLowering.lowerOperand(MO_Sym, Sym);
MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
@@ -535,11 +680,42 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ case AArch64::FMOVH0:
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ EmitFMov0(*MI);
+ return;
+
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*OutStreamer, SM, *MI);
case TargetOpcode::PATCHPOINT:
return LowerPATCHPOINT(*OutStreamer, SM, *MI);
+
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ LowerPATCHABLE_FUNCTION_ENTER(*MI);
+ return;
+
+ case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
+ LowerPATCHABLE_FUNCTION_EXIT(*MI);
+ return;
+
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ LowerPATCHABLE_TAIL_CALL(*MI);
+ return;
+
+ case AArch64::RETGUARD_JMP_TRAP:
+ {
+ MCSymbol *RGSuccSym = OutContext.createTempSymbol();
+ /* Compare and branch */
+ EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::CBZX)
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(MCSymbolRefExpr::create(RGSuccSym, OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::BRK).addImm(1));
+ OutStreamer->EmitLabel(RGSuccSym);
+ return;
+ }
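
[Editor's note: the RETGUARD_JMP_TRAP pseudo thus expands to a three-instruction check; an illustrative rendering, with xN standing for the register in operand 0 and .Lrg_succ for the temp symbol created above.]

    cbz xN, .Lrg_succ   ; retguard cookie check passed, continue to the return
    brk #0x1            ; mismatch: trap
    .Lrg_succ: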
+
}
// Finally, do the automated lowerings for everything else.
@@ -550,7 +726,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmPrinter() {
- RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
- RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
- RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target);
+ RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
+ RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
+ RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
}
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3f63d049c34..250d4edcc56 100644
--- a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -41,6 +41,10 @@
// | |
// |-----------------------------------|
// | |
+// | (Win64 only) varargs from reg |
+// | |
+// |-----------------------------------|
+// | |
// | prev_fp, prev_lr |
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
@@ -90,20 +94,44 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64ReturnProtectorLowering.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
using namespace llvm;
@@ -115,34 +143,60 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// Look at each instruction that references stack frames and return the stack
+/// size limit beyond which some of these instructions will require a scratch
+/// register during their expansion later.
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
+ // FIXME: For now, just conservatively guestimate based on unscaled indexing
+ // range. We'll end up allocating an unnecessary spill slot a lot, but
+ // realistically that's not a big deal at this stage of the game.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() || MI.isPseudo() ||
+ MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::ADDSXri)
+ continue;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+
+ int Offset = 0;
+ if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
+ AArch64FrameOffsetCannotUpdate)
+ return 0;
+ }
+ }
+ }
+ return 255;
+}
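
[Editor's note: the 255 returned above is the positive reach of a 9-bit signed unscaled (LDUR/STUR-style) offset; a standalone sketch of that assumption, not part of the patch.]

    #include <cassert>

    int main() {
      const int Bits = 9;                   // unscaled AArch64 load/store immediate width
      const int Hi = (1 << (Bits - 1)) - 1; // largest positive offset
      assert(Hi == 255);                    // the conservative limit returned above
      return 0;
    }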
+
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
// Don't use the red zone if the function explicitly asks us not to.
// This is typically used for kernel code.
- if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone))
+ if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
return false;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned NumBytes = AFI->getLocalStackSize();
- // Note: currently hasFP() is always true for hasCalls(), but that's an
- // implementation detail of the current code, not a strict requirement,
- // so stay safe here and check both.
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
- return false;
- return true;
+ return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
}
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MFI->hasStackMap() ||
- MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF));
+ // Retain behavior of always omitting the FP for leaf functions when possible.
+ return (MFI.hasCalls() &&
+ MF.getTarget().Options.DisableFramePointerElim(MF)) ||
+ MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ RegInfo->needsStackRealignment(MF);
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -152,10 +206,10 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- return !MF.getFrameInfo()->hasVarSizedObjects();
+ return !MF.getFrameInfo().hasVarSizedObjects();
}
-void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
@@ -170,7 +224,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
unsigned Align = getStackAlignment();
int64_t Amount = I->getOperand(0).getImm();
- Amount = RoundUpToAlignment(Amount, Align);
+ Amount = alignTo(Amount, Align);
if (!IsDestroy)
Amount = -Amount;
@@ -186,7 +240,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
//
- // Mostly call frames will be allocated at the start of a function so
+ // Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
@@ -198,106 +252,238 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
TII);
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- unsigned FramePtr) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MachineModuleInfo &MMI = MF.getMMI();
- const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const MCRegisterInfo *MRI = STI.getRegisterInfo();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
DebugLoc DL = MBB.findDebugLoc(MBBI);
// Add callee saved registers to move list.
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
return;
- const DataLayout &TD = MF.getDataLayout();
- bool HasFP = hasFP(MF);
-
- // Calculate amount of bytes used for return address storing.
- int stackGrowth = -TD.getPointerSize(0);
-
- // Calculate offsets.
- int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
- unsigned TotalSkipped = 0;
for (const auto &Info : CSI) {
unsigned Reg = Info.getReg();
- int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
- getOffsetOfLocalArea() + saveAreaOffset;
-
- // Don't output a new CFI directive if we're re-saving the frame pointer or
- // link register. This happens when the PrologEpilogInserter has inserted an
- // extra "STP" of the frame pointer and link register -- the "emitPrologue"
- // method automatically generates the directives when frame pointers are
- // used. If we generate CFI directives for the extra "STP"s, the linker will
- // lose track of the correct values for the frame pointer and link register.
- if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) {
- TotalSkipped += stackGrowth;
- continue;
- }
-
+ int64_t Offset =
+ MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, DwarfReg, Offset - TotalSkipped));
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
}
-/// Get FPOffset by analyzing the first instruction.
-static int getFPOffsetInPrologue(MachineInstr *MBBI) {
- // First instruction must a) allocate the stack and b) have an immediate
- // that is a multiple of -2.
- assert(((MBBI->getOpcode() == AArch64::STPXpre ||
- MBBI->getOpcode() == AArch64::STPDpre) &&
- MBBI->getOperand(3).getReg() == AArch64::SP &&
- MBBI->getOperand(4).getImm() < 0 &&
- (MBBI->getOperand(4).getImm() & 1) == 0));
-
- // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
- // required for the callee saved register area we get the frame pointer
- // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
- int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8;
- assert(FPOffset >= 0 && "Bad Framepointer Offset");
- return FPOffset;
-}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+ MachineFunction *MF = MBB->getParent();
+
+ // If MBB is an entry block, use X9 as the scratch register
+ if (&MF->front() == MBB)
+ return AArch64::X9;
-static bool isCSSave(MachineInstr *MBBI) {
- return MBBI->getOpcode() == AArch64::STPXi ||
- MBBI->getOpcode() == AArch64::STPDi ||
- MBBI->getOpcode() == AArch64::STPXpre ||
- MBBI->getOpcode() == AArch64::STPDpre;
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
+ LiveRegs.addLiveIns(*MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ // Prefer X9 since it was historically used for the prologue scratch reg.
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ if (LiveRegs.available(MRI, AArch64::X9))
+ return AArch64::X9;
+
+ for (unsigned Reg : AArch64::GPR64RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ return AArch64::NoRegister;
}
bool AArch64FrameLowering::canUseAsPrologue(
const MachineBasicBlock &MBB) const {
const MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Don't need a scratch register if we're not going to re-align the stack.
- // Otherwise, we may need a scratch register to be available and we do not
- // support that for now.
- return !RegInfo->needsStackRealignment(*MF);
+ if (!RegInfo->needsStackRealignment(*MF))
+ return true;
+ // Otherwise, we can use any block as long as it has a scratch register
+ // available.
+ return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
+}
+
+static bool windowsRequiresStackProbe(MachineFunction &MF,
+ unsigned StackSizeInBytes) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isTargetWindows())
+ return false;
+ const Function &F = MF.getFunction();
+ // TODO: When implementing stack protectors, take that into account
+ // for the probe threshold.
+ unsigned StackProbeSize = 4096;
+ if (F.hasFnAttribute("stack-probe-size"))
+ F.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ return StackSizeInBytes >= StackProbeSize;
+}
+
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
+ MachineFunction &MF, unsigned StackBumpBytes) const {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+ if (AFI->getLocalStackSize() == 0)
+ return false;
+
+ // 512 is the maximum immediate for stp/ldp that will be used for
+ // callee-save save/restores
+ if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
+ return false;
+
+ if (MFI.hasVarSizedObjects())
+ return false;
+
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // This isn't strictly necessary, but it simplifies things a bit since the
+ // current RedZone handling code assumes the SP is adjusted by the
+ // callee-save save/restore code.
+ if (canUseRedZone(MF))
+ return false;
+
+ return true;
+}
+
+// Convert callee-save register save/restore instruction to do stack pointer
+// decrement/increment to allocate/deallocate the callee-save stack area by
+// converting store/load to use pre/post increment version.
+static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+ unsigned NewOpc;
+ bool NewIsUnscaled = false;
+ switch (MBBI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected callee-save save/restore opcode!");
+ case AArch64::STPXi:
+ NewOpc = AArch64::STPXpre;
+ break;
+ case AArch64::STPDi:
+ NewOpc = AArch64::STPDpre;
+ break;
+ case AArch64::STRXui:
+ NewOpc = AArch64::STRXpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::STRDui:
+ NewOpc = AArch64::STRDpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDPXi:
+ NewOpc = AArch64::LDPXpost;
+ break;
+ case AArch64::LDPDi:
+ NewOpc = AArch64::LDPDpost;
+ break;
+ case AArch64::LDRXui:
+ NewOpc = AArch64::LDRXpost;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDRDui:
+ NewOpc = AArch64::LDRDpost;
+ NewIsUnscaled = true;
+ break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ MIB.addReg(AArch64::SP, RegState::Define);
+
+ // Copy all operands other than the immediate offset.
+ unsigned OpndIdx = 0;
+ for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
+ ++OpndIdx)
+ MIB.add(MBBI->getOperand(OpndIdx));
+
+ assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
+ "Unexpected immediate offset in first/last callee-save save/restore "
+ "instruction!");
+ assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ assert(CSStackSizeInc % 8 == 0);
+ int64_t CSStackSizeIncImm = CSStackSizeInc;
+ if (!NewIsUnscaled)
+ CSStackSizeIncImm /= 8;
+ MIB.addImm(CSStackSizeIncImm);
+
+ MIB.setMIFlags(MBBI->getFlags());
+ MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());
+
+ return std::prev(MBB.erase(MBBI));
+}
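
[Editor's note: as an illustration of the conversion, with an assumed CSStackSizeInc of -32 the first callee-save store

    stp x22, x21, [sp, #0]      ; STPXi, scaled immediate 0

is rewritten to the pre-decrement form

    stp x22, x21, [sp, #-32]!   ; STPXpre, scaled immediate -32/8 = -4

so allocating the callee-save area and issuing the first spill become one instruction.]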
+
+// Fixup callee-save register save/restore instructions to take into account
+// combined SP bump by adding the local stack size to the stack offsets.
+static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+ unsigned LocalStackSize) {
+ unsigned Opc = MI.getOpcode();
+ (void)Opc;
+ assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
+ Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
+ Opc == AArch64::LDPXi || Opc == AArch64::LDPDi ||
+ Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) &&
+ "Unexpected callee-save save/restore opcode!");
+
+ unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
+ assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
+ // All generated opcodes have scaled offsets.
+ assert(LocalStackSize % 8 == 0);
+ OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
}
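
[Editor's note: a worked example of the fixup, with an assumed LocalStackSize of 32:

    stp x20, x19, [sp, #16]   ; before: scaled immediate 2
    stp x20, x19, [sp, #48]   ; after:  scaled immediate 2 + 32/8 = 6

since the combined bump already moved SP past the locals, every callee-save slot sits LocalStackSize bytes further from the new SP.]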
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const Function *Fn = MF.getFunction();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const Function &F = MF.getFunction();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+ bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry();
bool HasFP = hasFP(MF);
// Debug location must be unknown since the first debug location is used
@@ -306,50 +492,74 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
- if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
- int NumBytes = (int)MFI->getStackSize();
- if (!AFI->hasStackFrame()) {
+ int NumBytes = (int)MFI.getStackSize();
+ if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
- // Label used to tie together the PROLOG_LABEL and the MachineMoves.
- MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-
+ if (!NumBytes)
+ return;
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
- if (NumBytes && !canUseRedZone(MF)) {
+ if (canUseRedZone(MF))
+ ++NumRedZoneFunctions;
+ else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
- unsigned CFIIndex = MMI.addFrameInst(
+ unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
- } else if (NumBytes) {
- ++NumRedZoneFunctions;
}
-
return;
}
- // Only set up FP if we actually need to.
- int FPOffset = 0;
- if (HasFP)
- FPOffset = getFPOffsetInPrologue(MBBI);
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
- // Move past the saves of the callee-saved registers.
- while (isCSSave(MBBI)) {
- ++MBBI;
- NumBytes -= 16;
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
+ // All of the remaining stack allocations are for locals.
+ AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
+
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+ NumBytes = 0;
+ } else if (PrologueSaveSize != 0) {
+ MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
+ -PrologueSaveSize);
+ NumBytes -= PrologueSaveSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ // Move past the saves of the callee-saved registers, fixing up the offsets
+ // and pre-inc if we decided to combine the callee-save and local stack
+ // pointer bump above.
+ MachineBasicBlock::iterator End = MBB.end();
+ while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+ ++MBBI;
+ }
if (HasFP) {
+ // Only set up FP if we actually need to. Frame pointer is fp =
+ // sp - fixedobject - 16.
+ int FPOffset = AFI->getCalleeSavedStackSize() - 16;
+ if (CombineSPBump)
+ FPOffset += AFI->getLocalStackSize();
+
// Issue sub fp, sp, FPOffset or
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
@@ -358,47 +568,84 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup);
}
- // All of the remaining stack allocations are for locals.
- AFI->setLocalStackSize(NumBytes);
+ if (windowsRequiresStackProbe(MF, NumBytes)) {
+ uint32_t NumWords = NumBytes >> 4;
- // Allocate space for the rest of the frame.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup);
- const unsigned Alignment = MFI->getMaxAlignment();
- const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
- unsigned scratchSPReg = AArch64::SP;
- if (NumBytes && NeedsRealignment) {
- // Use the first callee-saved register as a scratch register.
- scratchSPReg = AArch64::X9;
+ switch (MF.getTarget().getCodeModel()) {
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Kernel:
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addExternalSymbol("__chkstk")
+ .addReg(AArch64::X15, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ case CodeModel::Large:
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
+ .addReg(AArch64::X16, RegState::Define)
+ .addExternalSymbol("__chkstk")
+ .addExternalSymbol("__chkstk")
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
+ .addReg(AArch64::X16, RegState::Kill)
+ .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ }
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+ .addReg(AArch64::SP, RegState::Kill)
+ .addReg(AArch64::X15, RegState::Kill)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
+ .setMIFlags(MachineInstr::FrameSetup);
+ NumBytes = 0;
}
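
[Editor's note: illustrative numbers for the probe sequence above, assumed for the example: with NumBytes = 65536, x15 is loaded with NumWords = 65536 >> 4 = 4096, __chkstk probes the pages using that count, and the final

    sub sp, sp, x15, uxtx #4   ; subtracts x15 << 4 == NumBytes

re-applies the 16-byte scaling so SP moves by exactly NumBytes.]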
- // If we're a leaf function, try using the red zone.
- if (NumBytes && !canUseRedZone(MF))
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup);
+ // Allocate space for the rest of the frame.
+ if (NumBytes) {
+ const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+ unsigned scratchSPReg = AArch64::SP;
+
+ if (NeedsRealignment) {
+ scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(scratchSPReg != AArch64::NoRegister);
+ }
+
+ // If we're a leaf function, try using the red zone.
+ if (!canUseRedZone(MF))
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+ // the correct value here, as NumBytes also includes padding bytes,
+ // which shouldn't be counted here.
+ emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
- if (NumBytes && NeedsRealignment) {
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
- assert(NrBitsToZero > 1);
- assert(scratchSPReg != AArch64::SP);
-
- // SUB X9, SP, NumBytes
- // -- X9 is temporary register, so shouldn't contain any live data here,
- // -- free to use. This is already produced by emitFrameOffset above.
- // AND SP, X9, 0b11111...0000
- // The logical immediates have a non-trivial encoding. The following
- // formula computes the encoded immediate with all ones but
- // NrBitsToZero zero bits as least significant bits.
- uint32_t andMaskEncoded =
- (1 <<12) // = N
- | ((64-NrBitsToZero) << 6) // immr
- | ((64-NrBitsToZero-1) << 0) // imms
- ;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
- .addReg(scratchSPReg, RegState::Kill)
- .addImm(andMaskEncoded);
+ if (NeedsRealignment) {
+ const unsigned Alignment = MFI.getMaxAlignment();
+ const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ assert(NrBitsToZero > 1);
+ assert(scratchSPReg != AArch64::SP);
+
+ // SUB X9, SP, NumBytes
+ // -- X9 is temporary register, so shouldn't contain any live data here,
+ // -- free to use. This is already produced by emitFrameOffset above.
+ // AND SP, X9, 0b11111...0000
+ // The logical immediates have a non-trivial encoding. The following
+ // formula computes the encoded immediate with all ones but
+ // NrBitsToZero zero bits as least significant bits.
+ uint32_t andMaskEncoded = (1 << 12) // = N
+ | ((64 - NrBitsToZero) << 6) // immr
+ | ((64 - NrBitsToZero - 1) << 0); // imms
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+ .addReg(scratchSPReg, RegState::Kill)
+ .addImm(andMaskEncoded);
+ AFI->setStackRealigned(true);
+ }
}
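
[Editor's note: a worked instance of the logical-immediate encoding above, as a standalone C++ sketch whose constants follow the formula in the patch: for 16-byte realignment, Alignment = 16 and NrBitsToZero = 4, encoding the mask with the low four bits clear.]

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned NrBitsToZero = 4;                  // countTrailingZeros(16)
      uint32_t andMaskEncoded = (1 << 12)               // N = 1: 64-bit pattern
                              | ((64 - NrBitsToZero) << 6)      // immr = 60
                              | ((64 - NrBitsToZero - 1) << 0); // imms = 59
      assert(andMaskEncoded == 0x1F3B); // 0x1000 | (60 << 6) | 59
      return 0;
    }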
// If we need a base pointer, set it up here. It's whatever the value of the
@@ -486,73 +733,31 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
-
- // Record the location of the stored LR
- unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
-
- // Record the location of the stored FP
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, Reg, 2 * StackGrowth - FixedObject));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
- // Now emit the moves for whatever callee saved regs we have.
- emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
+ // Now emit the moves for whatever callee saved regs we have (including FP,
+ // LR if those are saved).
+ emitCalleeSavedFrameMoves(MBB, MBBI);
}
}
-static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-/// Checks whether the given instruction restores callee save registers
-/// and if so returns how many.
-static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) {
- unsigned RtIdx = 0;
- switch (MI.getOpcode()) {
- case AArch64::LDPXpost:
- case AArch64::LDPDpost:
- RtIdx = 1;
- // FALLTHROUGH
- case AArch64::LDPXi:
- case AArch64::LDPDi:
- if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) ||
- !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) ||
- MI.getOperand(RtIdx + 2).getReg() != AArch64::SP)
- return 0;
- return 2;
- }
- return 0;
-}
-
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool IsTailCallReturn = false;
@@ -562,12 +767,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
RetOpcode == AArch64::TCRETURNri;
}
- int NumBytes = MFI->getStackSize();
+ int NumBytes = MFI.getStackSize();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
- if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
// Initial and residual are named for consistency with the prologue. Note that
@@ -599,7 +804,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
- // | (NumRestores * 8) | | |
+ // | (CalleeSavedStackSize)| | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
@@ -614,41 +819,79 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
//
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
- NumBytes += ArgumentPopSize;
- unsigned NumRestores = 0;
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+
+ if (!CombineSPBump && PrologueSaveSize != 0)
+ convertCalleeSaveRestoreToSPPrePostIncDec(
+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
+
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
--LastPopI;
- unsigned Restores = getNumCSRestores(*LastPopI, CSRegs);
- NumRestores += Restores;
- if (Restores == 0) {
+ if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
++LastPopI;
break;
- }
+ } else if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+ }
+
+ // If there is a single SP update, insert it before the ret and we're done.
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ NumBytes + ArgumentPopSize, TII,
+ MachineInstr::FrameDestroy);
+ return;
}
- NumBytes -= NumRestores * 8;
+
+ NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
+ bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
- // stack pointer.
- if (!canUseRedZone(MF))
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
- TII);
- return;
+ // stack pointer (but we may need to pop stack args for fastcc).
+ if (RedZone && ArgumentPopSize == 0)
+ return;
+
+ bool NoCalleeSaveRestore = PrologueSaveSize == 0;
+ int StackRestoreBytes = RedZone ? 0 : NumBytes;
+ if (NoCalleeSaveRestore)
+ StackRestoreBytes += ArgumentPopSize;
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ // If we were able to combine the local stack pop with the argument pop,
+ // then we're done.
+ if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ return;
+ NumBytes = 0;
}
// Restore the original stack pointer.
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
- if (NumBytes || MFI->hasVarSizedObjects())
+ if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags);
+ -AFI->getCalleeSavedStackSize() + 16, TII,
+ MachineInstr::FrameDestroy);
+ else if (NumBytes)
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
+ MachineInstr::FrameDestroy);
+
+ // This must be placed after the callee-save restore code because that code
+ // assumes the SP is at the same location as it was after the callee-save save
+ // code in the prologue.
+ if (ArgumentPopSize)
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ ArgumentPopSize, TII, MachineInstr::FrameDestroy);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -664,13 +907,17 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
int FI, unsigned &FrameReg,
bool PreferFP) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- int FPOffset = MFI->getObjectOffset(FI) + 16;
- int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
- bool isFixed = MFI->isFixedObjectIndex(FI);
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
+ int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
+ bool isFixed = MFI.isFixedObjectIndex(FI);
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
@@ -695,7 +942,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
// using the FP regardless, though, as the SP offset is unknown
// and we don't have a base pointer available. If an offset is
// available via the FP and the SP, use whichever is closest.
- if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
+ if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 ||
(FPOffset >= -256 && Offset > -FPOffset))
UseFP = true;
}
@@ -726,156 +973,234 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
- if (Reg != AArch64::LR)
- return getKillRegState(true);
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm.returnaddress intrinsic and with arguments passed in

+ // callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ return getKillRegState(!IsLiveIn);
+}
- // LR maybe referred to later by an @llvm.returnaddress intrinsic.
- bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
- bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
- return getKillRegState(LRKill);
+static bool produceCompactUnwindFrame(MachineFunction &MF) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ AttributeList Attrs = MF.getFunction().getAttributes();
+ return Subtarget.isTargetMachO() &&
+ !(Subtarget.getTargetLowering()->supportSwiftError() &&
+ Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
-bool AArch64FrameLowering::spillCalleeSavedRegisters(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+namespace {
+
+struct RegPairInfo {
+ unsigned Reg1 = AArch64::NoRegister;
+ unsigned Reg2 = AArch64::NoRegister;
+ int FrameIdx;
+ int Offset;
+ bool IsGPR;
+
+ RegPairInfo() = default;
+
+ bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+};
+
+} // end anonymous namespace
+
+static void computeCalleeSaveRegisterPairs(
+ MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+
+ if (CSI.empty())
+ return;
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
unsigned Count = CSI.size();
- DebugLoc DL;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ (void)CC;
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (Count & 1) == 0) &&
+ "Odd number of callee-saved regs to spill!");
+ int Offset = AFI->getCalleeSavedStackSize();
+
+ for (unsigned i = 0; i < Count; ++i) {
+ RegPairInfo RPI;
+ RPI.Reg1 = CSI[i].getReg();
+
+ assert(AArch64::GPR64RegClass.contains(RPI.Reg1) ||
+ AArch64::FPR64RegClass.contains(RPI.Reg1));
+ RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1);
+
+ // Add the next reg to the pair if it is in the same register class.
+ if (i + 1 < Count) {
+ unsigned NextReg = CSI[i + 1].getReg();
+ if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) ||
+ (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg)))
+ RPI.Reg2 = NextReg;
+ }
- for (unsigned i = 0; i < Count; i += 2) {
- unsigned idx = Count - i - 2;
- unsigned Reg1 = CSI[idx].getReg();
- unsigned Reg2 = CSI[idx + 1].getReg();
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
//
// The order of the registers in the list is controlled by
// getCalleeSavedRegs(), so they will always be in-order, as well.
- assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+ assert((!RPI.isPaired() ||
+ (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
+
+ // MachO's compact unwind format relies on all registers being stored in
+ // adjacent register pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (RPI.isPaired() &&
+ ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
+ RPI.Reg1 + 1 == RPI.Reg2))) &&
+ "Callee-save registers not saved as adjacent register pair!");
+
+ RPI.FrameIdx = CSI[i].getFrameIdx();
+
+ if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
+ // Round up size of non-pair to pair size if we need to pad the
+ // callee-save area to ensure 16-byte alignment.
+ Offset -= 16;
+ assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
+ MFI.setObjectAlignment(RPI.FrameIdx, 16);
+ AFI->setCalleeSaveStackHasFreeSpace(true);
+ } else
+ Offset -= RPI.isPaired() ? 16 : 8;
+ assert(Offset % 8 == 0);
+ RPI.Offset = Offset / 8;
+ assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+ "Offset out of bounds for LDP/STP immediate");
+
+ RegPairs.push_back(RPI);
+ if (RPI.isPaired())
+ ++i;
+ }
+}
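
To make the offset bookkeeping above concrete, here is a minimal standalone sketch of the same arithmetic, assuming a hypothetical CSI list of three GPR pairs (lr/fp, x20/x19, x22/x21) and a callee-saved stack size of 48 bytes; the names are illustrative, not LLVM API:

    #include <cstdio>

    int main() {
      int Offset = 48;                  // AFI->getCalleeSavedStackSize()
      const char *Pair[] = {"lr/fp", "x20/x19", "x22/x21"};
      for (int i = 0; i < 3; ++i) {
        Offset -= 16;                   // each paired spill occupies 16 bytes
        // RPI.Offset is the scaled STP/LDP immediate: byte offset / 8.
        printf("%-8s byte offset %2d, imm field %d\n",
               Pair[i], Offset, Offset / 8);
      }
    }

This prints byte offsets 32, 16, 0 (scaled immediates 4, 2, 0), matching the example store sequence in spillCalleeSavedRegisters() below.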
+
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ SmallVector<RegPairInfo, 8> RegPairs;
+
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
unsigned StrOpc;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
- // Issue sequence of non-sp increment and pi sp spills for cs regs. The
- // first spill is a pre-increment that allocates the stack.
+
+ // Issue sequence of spills for cs regs. The first spill may be converted
+ // to a pre-decrement store later by emitPrologue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
// For example:
- // stp x22, x21, [sp, #-48]! // addImm(-6)
+ // stp x22, x21, [sp, #0] // addImm(+0)
// stp x20, x19, [sp, #16] // addImm(+2)
// stp fp, lr, [sp, #32] // addImm(+4)
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
- // Note: Similar rational and sequence for restores in epilog.
- if (AArch64::GPR64RegClass.contains(Reg1)) {
- assert(AArch64::GPR64RegClass.contains(Reg2) &&
- "Expected GPR64 callee-saved register pair!");
- // For first spill use pre-increment store.
- if (i == 0)
- StrOpc = AArch64::STPXpre;
- else
- StrOpc = AArch64::STPXi;
- } else if (AArch64::FPR64RegClass.contains(Reg1)) {
- assert(AArch64::FPR64RegClass.contains(Reg2) &&
- "Expected FPR64 callee-saved register pair!");
- // For first spill use pre-increment store.
- if (i == 0)
- StrOpc = AArch64::STPDpre;
- else
- StrOpc = AArch64::STPDi;
- } else
- llvm_unreachable("Unexpected callee saved register!");
- DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
- << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
- << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
- // Compute offset: i = 0 => offset = -Count;
- // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
- const int Offset = (i == 0) ? -Count : i;
- assert((Offset >= -64 && Offset <= 63) &&
- "Offset out of bounds for STP immediate");
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
- MIB.addReg(AArch64::SP, RegState::Define);
+ // Note: Similar rationale and sequence for restores in epilog.
+ if (RPI.IsGPR)
+ StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
+ else
+ StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
+ DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired())
+ dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx+1;
+ dbgs() << ")\n");
- MBB.addLiveIn(Reg1);
- MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
- .addReg(Reg1, getPrologueDeath(MF, Reg1))
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
+ if (RPI.isPaired()) {
+ if (!MRI.isReserved(Reg2))
+ MBB.addLiveIn(Reg2);
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOStore, 8, 8));
+ }
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
- .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+ .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit
.setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOStore, 8, 8));
}
return true;
}
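
Note the reverse iteration: the pair with the lowest offset is emitted first, so when the callee-save allocation cannot be merged with the local stack allocation, emitPrologue only has to rewrite that first store into a pre-decrement form. For the 48-byte example above, that conversion would turn

    stp x22, x21, [sp, #0]       // as emitted here

into

    stp x22, x21, [sp, #-48]!    // after conversion by emitPrologue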
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- unsigned Count = CSI.size();
DebugLoc DL;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ SmallVector<RegPairInfo, 8> RegPairs;
if (MI != MBB.end())
DL = MI->getDebugLoc();
- for (unsigned i = 0; i < Count; i += 2) {
- unsigned Reg1 = CSI[i].getReg();
- unsigned Reg2 = CSI[i + 1].getReg();
- // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
- // list to come in sorted by frame index so that we can issue the store
- // pair instructions directly. Assert if we see anything otherwise.
- assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
- "Out of order callee saved regs!");
- // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
- // the last load is sp-pi post-increment and de-allocates the stack:
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
+
+ // Issue sequence of restores for cs regs. The last restore may be converted
+ // to a post-increment load later by emitEpilogue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
// For example:
// ldp fp, lr, [sp, #32] // addImm(+4)
// ldp x20, x19, [sp, #16] // addImm(+2)
- // ldp x22, x21, [sp], #48 // addImm(+6)
+ // ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
+ if (RPI.IsGPR)
+ LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
+ else
+ LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
+ DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired())
+ dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx+1;
+ dbgs() << ")\n");
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
- if (AArch64::GPR64RegClass.contains(Reg1)) {
- assert(AArch64::GPR64RegClass.contains(Reg2) &&
- "Expected GPR64 callee-saved register pair!");
- if (i == Count - 2)
- LdrOpc = AArch64::LDPXpost;
- else
- LdrOpc = AArch64::LDPXi;
- } else if (AArch64::FPR64RegClass.contains(Reg1)) {
- assert(AArch64::FPR64RegClass.contains(Reg2) &&
- "Expected FPR64 callee-saved register pair!");
- if (i == Count - 2)
- LdrOpc = AArch64::LDPDpost;
- else
- LdrOpc = AArch64::LDPDi;
- } else
- llvm_unreachable("Unexpected callee saved register!");
- DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
- << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
- << ", " << CSI[i + 1].getFrameIdx() << ")\n");
-
- // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
- // etc.
- const int Offset = (i == Count - 2) ? Count : Count - i - 2;
- assert((Offset >= -64 && Offset <= 63) &&
- "Offset out of bounds for LDP immediate");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
- if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
- MIB.addReg(AArch64::SP, RegState::Define);
-
- MIB.addReg(Reg2, getDefRegState(true))
- .addReg(Reg1, getDefRegState(true))
+ if (RPI.isPaired()) {
+ MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOLoad, 8, 8));
+ }
+ MIB.addReg(Reg1, getDefRegState(true))
.addReg(AArch64::SP)
- .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8]
- // where the factor * 8 is implicit
+ .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOLoad, 8, 8));
}
return true;
}
@@ -885,107 +1210,93 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
- if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- SmallVector<unsigned, 4> UnspilledCSGPRs;
- SmallVector<unsigned, 4> UnspilledCSFPRs;
+ unsigned UnspilledCSGPR = AArch64::NoRegister;
+ unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+ unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
+ ? RegInfo->getBaseRegister()
+ : (unsigned)AArch64::NoRegister;
+
+ unsigned SpillEstimate = SavedRegs.count();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ unsigned PairedReg = CSRegs[i ^ 1];
+ if (Reg == BasePointerReg)
+ SpillEstimate++;
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg))
+ SpillEstimate++;
+ }
+
+ if (MFI.hasReturnProtectorRegister()) {
+ SavedRegs.set(MFI.getReturnProtectorRegister());
+ SpillEstimate++;
+ }
+
+ SpillEstimate += 2; // Conservatively include FP+LR in the estimate
+ unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate;
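
As a worked example of the estimate, take a hypothetical function where SavedRegs already counts 4 registers, one of them the base pointer, and the compact-unwind rule forces 1 extra paired spill: SpillEstimate = 4 + 1 + 1 + 2 (FP+LR) = 8, so StackEstimate = estimateStackSize(MF) + 64 bytes. A return-protector register, when present, adds one more slot as above.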
// The frame record needs to be created by saving the appropriate registers
- if (hasFP(MF)) {
+ if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) {
SavedRegs.set(AArch64::FP);
SavedRegs.set(AArch64::LR);
}
- // Spill the BasePtr if it's used. Do this first thing so that the
- // getCalleeSavedRegs() below will get the right answer.
- if (RegInfo->hasBasePointer(MF))
- SavedRegs.set(RegInfo->getBaseRegister());
-
- if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF))
- SavedRegs.set(AArch64::X9);
+ unsigned ExtraCSSpill = 0;
+ // Figure out which callee-saved registers to save/restore.
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ const unsigned Reg = CSRegs[i];
- // If any callee-saved registers are used, the frame cannot be eliminated.
- unsigned NumGPRSpilled = 0;
- unsigned NumFPRSpilled = 0;
- bool ExtraCSSpill = false;
- bool CanEliminateFrame = true;
- DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:");
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ // Add the base pointer register to SavedRegs if it is callee-save.
+ if (Reg == BasePointerReg)
+ SavedRegs.set(Reg);
- // Check pairs of consecutive callee-saved registers.
- for (unsigned i = 0; CSRegs[i]; i += 2) {
- assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
-
- const unsigned OddReg = CSRegs[i];
- const unsigned EvenReg = CSRegs[i + 1];
- assert((AArch64::GPR64RegClass.contains(OddReg) &&
- AArch64::GPR64RegClass.contains(EvenReg)) ^
- (AArch64::FPR64RegClass.contains(OddReg) &&
- AArch64::FPR64RegClass.contains(EvenReg)) &&
- "Register class mismatch!");
-
- const bool OddRegUsed = SavedRegs.test(OddReg);
- const bool EvenRegUsed = SavedRegs.test(EvenReg);
-
- // Early exit if none of the registers in the register pair is actually
- // used.
- if (!OddRegUsed && !EvenRegUsed) {
- if (AArch64::GPR64RegClass.contains(OddReg)) {
- UnspilledCSGPRs.push_back(OddReg);
- UnspilledCSGPRs.push_back(EvenReg);
- } else {
- UnspilledCSFPRs.push_back(OddReg);
- UnspilledCSFPRs.push_back(EvenReg);
+ bool RegUsed = SavedRegs.test(Reg);
+ unsigned PairedReg = CSRegs[i ^ 1];
+ if (!RegUsed) {
+ if (AArch64::GPR64RegClass.contains(Reg) &&
+ !RegInfo->isReservedReg(MF, Reg)) {
+ UnspilledCSGPR = Reg;
+ UnspilledCSGPRPaired = PairedReg;
}
continue;
}
- unsigned Reg = AArch64::NoRegister;
- // If only one of the registers of the register pair is used, make sure to
- // mark the other one as used as well.
- if (OddRegUsed ^ EvenRegUsed) {
- // Find out which register is the additional spill.
- Reg = OddRegUsed ? EvenReg : OddReg;
- SavedRegs.set(Reg);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ // FIXME: the usual format is actually better if unwinding isn't needed.
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
+ SavedRegs.set(PairedReg);
+ if (AArch64::GPR64RegClass.contains(PairedReg) &&
+ !RegInfo->isReservedReg(MF, PairedReg))
+ ExtraCSSpill = PairedReg;
}
+ }
- DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
- DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
-
- assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
- (RegInfo->getEncodingValue(OddReg) + 1 ==
- RegInfo->getEncodingValue(EvenReg))) &&
- "Register pair of non-adjacent registers!");
- if (AArch64::GPR64RegClass.contains(OddReg)) {
- NumGPRSpilled += 2;
- // If it's not a reserved register, we can use it in lieu of an
- // emergency spill slot for the register scavenger.
- // FIXME: It would be better to instead keep looking and choose another
- // unspilled register that isn't reserved, if there is one.
- if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- } else
- NumFPRSpilled += 2;
+ DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ for (unsigned Reg : SavedRegs.set_bits())
+ dbgs() << ' ' << printReg(Reg, RegInfo);
+ dbgs() << "\n";);
- CanEliminateFrame = false;
- }
+ // If any callee-saved registers are used, the frame cannot be eliminated.
+ unsigned NumRegsSpilled = SavedRegs.count();
+ bool CanEliminateFrame = NumRegsSpilled == 0;
- // FIXME: Set BigStack if any stack slot references may be out of range.
- // For now, just conservatively guestimate based on unscaled indexing
- // range. We'll end up allocating an unnecessary spill slot a lot, but
- // realistically that's not a big deal at this stage of the game.
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
- MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned CFSize =
- MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+ unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
- bool BigStack = (CFSize >= 256);
+ unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
+ bool BigStack = (CFSize > EstimatedStackSizeLimit);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
@@ -995,30 +1306,45 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// register scavenging. If we already spilled an extra callee-saved register
// above to keep the number of spills even, we don't need to do anything else
// here.
- if (BigStack && !ExtraCSSpill) {
-
- // If we're adding a register to spill here, we have to add two of them
- // to keep the number of regs to spill even.
- assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
- unsigned Count = 0;
- while (!UnspilledCSGPRs.empty() && Count < 2) {
- unsigned Reg = UnspilledCSGPRs.back();
- UnspilledCSGPRs.pop_back();
- DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
+ if (BigStack) {
+ if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
+ DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
<< " to get a scratch register.\n");
- SavedRegs.set(Reg);
- ExtraCSSpill = true;
- ++Count;
+ SavedRegs.set(UnspilledCSGPR);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs, so if we need to spill one extra for BigStack, then we need to
+ // store the pair.
+ if (produceCompactUnwindFrame(MF))
+ SavedRegs.set(UnspilledCSGPRPaired);
+ ExtraCSSpill = UnspilledCSGPRPaired;
+ NumRegsSpilled = SavedRegs.count();
}
// If we didn't find an extra callee-saved register to spill, create
// an emergency spill slot.
- if (!ExtraCSSpill) {
- const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
- int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+ if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass &RC = AArch64::GPR64RegClass;
+ unsigned Size = TRI->getSpillSize(RC);
+ unsigned Align = TRI->getSpillAlignment(RC);
+ int FI = MFI.CreateStackObject(Size, Align, false);
RS->addScavengingFrameIndex(FI);
DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
}
}
+
+ // Round up to register pair alignment to avoid additional SP adjustment
+ // instructions.
+ AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+}
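
For example, spilling 5 registers gives 8 * 5 = 40 bytes, which alignTo rounds up to 48 so the callee-save area keeps SP 16-byte aligned without a separate adjustment; the padded 8 bytes are the free space that enableStackSlotScavenging() below hands back to the stack-slot scavenger.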
+
+bool AArch64FrameLowering::enableStackSlotScavenging(
+ const MachineFunction &MF) const {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasCalleeSaveStackFreeSpace();
+}
+
+const ReturnProtectorLowering *AArch64FrameLowering::getReturnProtector() const {
+ return &RPL;
}
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 7d8354c3878..fcbbc1434f1 100644
--- a/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/gnu/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -14,29 +14,34 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "AArch64ReturnProtectorLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class AArch64FrameLowering : public TargetFrameLowering {
public:
+
+ const AArch64ReturnProtectorLowering RPL;
+
explicit AArch64FrameLowering()
: TargetFrameLowering(StackGrowsDown, 16, 0, 16,
- true /*StackRealignable*/) {}
+ true /*StackRealignable*/), RPL() {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned FramePtr) const;
+ MachineBasicBlock::iterator MBBI) const;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ const ReturnProtectorLowering *getReturnProtector() const override;
+
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
@@ -51,7 +56,7 @@ public:
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
/// \brief Can this function use the red zone for local allocations.
@@ -67,6 +72,12 @@ public:
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
}
+
+ bool enableStackSlotScavenging(const MachineFunction &MF) const override;
+
+private:
+ bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d02bc9ff394..276d664ac61 100644
--- a/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/gnu/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -18,23 +18,46 @@ def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
+ AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
+ AssemblerPredicate<"FeatureDotProd", "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasLSE : Predicate<"Subtarget->hasLSE()">,
+ AssemblerPredicate<"FeatureLSE", "lse">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
+def HasRDM : Predicate<"Subtarget->hasRDM()">,
+ AssemblerPredicate<"FeatureRDM", "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
AssemblerPredicate<"FeatureSPE", "spe">;
+def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
+ AssemblerPredicate<"FeatureFuseAES",
+ "fuse-aes">;
+def HasSVE : Predicate<"Subtarget->hasSVE()">,
+ AssemblerPredicate<"FeatureSVE", "sve">;
+def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
+ AssemblerPredicate<"FeatureRCPC", "rcpc">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+ : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+
+def UseNegativeImmediates
+ : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -144,7 +167,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
- SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -283,6 +307,11 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
+def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
+
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -295,15 +324,18 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
-//
-def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
-def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
-def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
-def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
-def ForCodeSize : Predicate<"ForCodeSize">;
-def NotForCodeSize : Predicate<"!ForCodeSize">;
+// We could compute these on a per-module basis but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def ForCodeSize : Predicate<"MF->getFunction().optForSize()">;
+ def NotForCodeSize : Predicate<"!MF->getFunction().optForSize()">;
+ // Avoid generating STRQro if it is slow, unless we're optimizing for code size.
+ def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().optForSize()">;
+}
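
For example, one function in a module may be marked optsize while the next is not; a predicate value captured once per module would then mis-select for one of them. RecomputePerFunction makes TableGen re-evaluate MF->getFunction().optForSize() for each MachineFunction instead.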
include "AArch64InstrFormats.td"
+include "SVEInstrFormats.td"
//===----------------------------------------------------------------------===//
@@ -312,10 +344,14 @@ include "AArch64InstrFormats.td"
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>;
+// We set Sched to the empty list because we expect these instructions to
+// simply get removed in most cases.
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+ [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
let isReMaterializable = 1, isCodeGenOnly = 1 in {
@@ -363,6 +399,12 @@ def MOVaddrEXT
} // isReMaterializable, isCodeGenOnly
+//===----------------------------------------------------------------------===//
+// Pseudo instruction used by retguard
+let isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+ def RETGUARD_JMP_TRAP: Pseudo<(outs), (ins GPR64:$reg), []>;
+}
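
A rough sketch of the intent, assuming $reg holds the result of re-XORing the saved retguard cookie against LR in the epilogue (the exact lowering lives in the AArch64AsmPrinter.cpp hunk of this diff): zero means the return address is intact, anything else traps, roughly

    cbz  xN, 1f      // hypothetical expansion, for illustration only
    brk  #0x1        // corrupted return address: trap
    1:  ret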
+
def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
(LOADgot tglobaltlsaddr:$addr)>;
@@ -383,6 +425,7 @@ def : InstAlias<"wfe", (HINT 0b010)>;
def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
// v8.2a Statistical Profiling extension
def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
@@ -404,6 +447,108 @@ def ISB : CRmSystemI<barrier_op, 0b110, "isb",
[(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
}
+// ARMv8.2 Dot Product
+let Predicates = [HasDotProd] in {
+def UDOT2S : BaseSIMDThreeSameVectorDot<0, 1, "udot", ".2s", ".8b">;
+def SDOT2S : BaseSIMDThreeSameVectorDot<0, 0, "sdot", ".2s", ".8b">;
+def UDOT4S : BaseSIMDThreeSameVectorDot<1, 1, "udot", ".4s", ".16b">;
+def SDOT4S : BaseSIMDThreeSameVectorDot<1, 0, "sdot", ".4s", ".16b">;
+def UDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 1, "udot", ".2s", ".8b", ".4b">;
+def SDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 0, "sdot", ".2s", ".8b", ".4b">;
+def UDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 1, "udot", ".4s", ".16b", ".4b">;
+def SDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 0, "sdot", ".4s", ".16b", ".4b">;
+}
+
+let Predicates = [HasRCPC] in {
+ // v8.3 Release Consistent Processor Consistent support, optional in v8.2.
+ def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>;
+ def LDAPRH : RCPCLoad<0b01, "ldaprh", GPR32>;
+ def LDAPRW : RCPCLoad<0b10, "ldapr", GPR32>;
+ def LDAPRX : RCPCLoad<0b11, "ldapr", GPR64>;
+}
+
+// v8.3a complex add and multiply-accumulate. No predicate here; that is done
+// inside the multiclass, as the FP16 versions need different predicates.
+defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop,
+ "fcmla", null_frag>;
+defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
+ "fcadd", null_frag>;
+defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
+ null_frag>;
+
+let Predicates = [HasV8_3a] in {
+ // v8.3a Pointer Authentication
+ let Uses = [LR], Defs = [LR] in {
+ def PACIAZ : SystemNoOperands<0b000, "paciaz">;
+ def PACIBZ : SystemNoOperands<0b010, "pacibz">;
+ def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
+ def AUTIBZ : SystemNoOperands<0b110, "autibz">;
+ }
+ let Uses = [LR, SP], Defs = [LR] in {
+ def PACIASP : SystemNoOperands<0b001, "paciasp">;
+ def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
+ def AUTIASP : SystemNoOperands<0b101, "autiasp">;
+ def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
+ }
+ let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
+ def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
+ def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
+ def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
+ def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
+ }
+
+ let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
+ def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
+ }
+
+ multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
+ def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
+ def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
+ def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da")>;
+ def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db")>;
+ def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza")>;
+ def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza")>;
+ def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb")>;
+ def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb")>;
+ }
+
+ defm PAC : SignAuth<0b000, 0b010, "pac">;
+ defm AUT : SignAuth<0b001, 0b011, "aut">;
+
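+  // Each defm concatenates its prefix onto the multiclass def names, so
+  // defm PAC expands to PACIA, PACIB, PACDA, PACDB, PACIZA, PACDZA, PACIZB
+  // and PACDZB, and defm AUT to the matching AUT* set.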
+ def XPACI : SignAuthZero<0b100, 0b00, "xpaci">;
+ def XPACD : SignAuthZero<0b100, 0b01, "xpacd">;
+ def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;
+
+ // Combined Instructions
+ def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
+ def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
+ def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
+ def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;
+
+ def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
+ def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
+ def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
+ def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
+
+ let isReturn = 1 in {
+ def RETAA : AuthReturn<0b010, 0, "retaa">;
+ def RETAB : AuthReturn<0b010, 1, "retab">;
+ def ERETAA : AuthReturn<0b100, 0, "eretaa">;
+ def ERETAB : AuthReturn<0b100, 1, "eretab">;
+ }
+
+ defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>;
+ defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>;
+
+ // v8.3a floating point conversion for javascript
+ let Predicates = [HasV8_3a, HasFPARMv8] in
+ def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
+ "fjcvtzs", []> {
+ let Inst{31} = 0;
+ }
+
+} // HasV8_3a
+
def : InstAlias<"clrex", (CLREX 0xf)>;
def : InstAlias<"isb", (ISB 0xf)>;
@@ -414,7 +559,8 @@ def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
// TPIDR_EL0.
-def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
+ [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;
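
The removed pattern shows the raw system-register encoding (0xde82 is TPIDR_EL0); the pseudo still ends up as the same read, but as a real instruction with WriteSys scheduling information, roughly:

    mrs x0, TPIDR_EL0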
// The cycle counter PMC register is PMCCNTR_EL0.
let Predicates = [HasPerfMon] in
@@ -439,8 +585,8 @@ let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;
// First group of aliases covers an implicit "lsl #0".
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>;
def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
@@ -457,10 +603,10 @@ def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>;
def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
@@ -468,8 +614,8 @@ def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>;
// Final group of aliases covers true "mov $Rd, $imm" cases.
multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
@@ -528,6 +674,12 @@ def i64imm_32bit : ImmLeaf<i64, [{
return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
}]>;
+def s64imm_32bit : ImmLeaf<i64, [{
+ int64_t Imm64 = static_cast<int64_t>(Imm);
+ return Imm64 >= std::numeric_limits<int32_t>::min() &&
+ Imm64 <= std::numeric_limits<int32_t>::max();
+}]>;
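
The two leaves encode complementary 32-bit reachability tests; a standalone restatement in plain C++ (illustrative only, not the TableGen API):

    #include <cstdint>
    #include <limits>

    // i64imm_32bit: the value survives a round trip through zero-extension,
    // so it can be rebuilt from a 32-bit chunk (MOVi32imm) and zero-extended.
    bool fitsZExt32(int64_t Imm) {
      return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
    }

    // s64imm_32bit: the value survives a round trip through sign-extension,
    // which is what the SMADDL/SMSUBL patterns further down rely on.
    bool fitsSExt32(int64_t Imm) {
      return Imm >= std::numeric_limits<int32_t>::min() &&
             Imm <= std::numeric_limits<int32_t>::max();
    }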
+
def trunc_imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
@@ -557,31 +709,31 @@ def : Pat<(f64 fpimm:$in),
// sequences.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
tglobaladdr:$g1, tglobaladdr:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
- tglobaladdr:$g2, 32),
- tglobaladdr:$g1, 16),
- tglobaladdr:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
+ tglobaladdr:$g1, 16),
+ tglobaladdr:$g2, 32),
+ tglobaladdr:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
tblockaddress:$g1, tblockaddress:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
- tblockaddress:$g2, 32),
- tblockaddress:$g1, 16),
- tblockaddress:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
tconstpool:$g1, tconstpool:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
- tconstpool:$g2, 32),
- tconstpool:$g1, 16),
- tconstpool:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
+ tconstpool:$g1, 16),
+ tconstpool:$g2, 32),
+ tconstpool:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
tjumptable:$g1, tjumptable:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
- tjumptable:$g2, 32),
- tjumptable:$g1, 16),
- tjumptable:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
+ tjumptable:$g1, 16),
+ tjumptable:$g2, 32),
+ tjumptable:$g3, 48)>;
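
The reordering changes which 16-bit chunk seeds the sequence, not the result; materializing a hypothetical constant of the same shape now starts from the low bits:

    movz x0, #0xdef0            // g0: bits [15:0]
    movk x0, #0x9abc, lsl #16   // g1
    movk x0, #0x5678, lsl #32   // g2
    movk x0, #0x1234, lsl #48   // g3 => x0 = 0x123456789abcdef0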
//===----------------------------------------------------------------------===//
@@ -679,10 +831,11 @@ def : InstAlias<"negs $dst, $src$shift",
// Unsigned/Signed divide
defm UDIV : Div<0, "udiv", udiv>;
defm SDIV : Div<1, "sdiv", sdiv>;
-let isCodeGenOnly = 1 in {
-defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>;
-defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>;
-}
+
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>;
// Variable shift
defm ASRV : Shift<0b10, "asr", sra>;
@@ -700,7 +853,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>;
def : ShiftAlias<"rorv", RORVXr, GPR64>;
// Multiply-add
-let AddedComplexity = 7 in {
+let AddedComplexity = 5 in {
defm MADD : MulAccum<0, "madd", add>;
defm MSUB : MulAccum<1, "msub", sub>;
@@ -717,7 +870,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
-} // AddedComplexity = 7
+} // AddedComplexity = 5
let AddedComplexity = 5 in {
def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
@@ -734,6 +887,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
(SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
(UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
+ GPR64:$Ra)),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
+ (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
} // AddedComplexity = 5
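
As a concrete illustration of the new immediate patterns, for a hypothetical long f(int x) { return (long)x * 40000; } the constant passes s64imm_32bit, so instead of materializing a 64-bit constant and using MUL, selection can emit:

    mov    w8, #40000
    smaddl x0, w0, w8, xzr     // x0 = sext(w0) * sext(w8) + 0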
def : MulAccumWAlias<"mul", MADDWrrr>;
@@ -899,10 +1086,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
defm CLS : OneOperandData<0b101, "cls">;
defm CLZ : OneOperandData<0b100, "clz", ctlz>;
-defm RBIT : OneOperandData<0b000, "rbit">;
-
-def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
-def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;
+defm RBIT : OneOperandData<0b000, "rbit", bitreverse>;
def REV16Wr : OneWRegData<0b001, "rev16",
UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
@@ -1085,10 +1269,26 @@ def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
(CSINCWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
(CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV),
+ (CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV),
+ (CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
(CSINVWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
(CSINVXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
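
These cover selects against the constants 1 and -1, which CSINC/CSINV can produce from the zero register. For a hypothetical long pick(long t, int c) { return c ? t : 1; }:

    cmp   w1, #0
    csinc x0, x0, xzr, ne      // x0 if c != 0, else xzr + 1 == 1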
// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser already inverts the condition code
@@ -1158,7 +1358,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
// Create a separate pseudo-instruction for codegen to use so that we don't
// flag lr as used in every function. It'll be restored before the RET by the
// epilogue if it's legitimately used.
-def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
+ Sched<[WriteBrReg]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
@@ -1168,7 +1369,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
// (which in the usual case is a BLR).
let hasSideEffects = 1 in
-def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
let AsmString = ".tlsdesccall $sym";
}
@@ -1178,7 +1379,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
isCodeGenOnly = 1 in
def TLSDESC_CALLSEQ
: Pseudo<(outs), (ins i64imm:$sym),
- [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>;
+ [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
+ Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
(TLSDESC_CALLSEQ texternalsym:$sym)>;
@@ -1954,6 +2156,17 @@ defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
+let Predicates = [UseSTRQro], AddedComplexity = 10 in {
+ def : Pat<(store (f128 FPR128:$Rt),
+ (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend128:$extend)),
+ (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>;
+ def : Pat<(store (f128 FPR128:$Rt),
+ (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend128:$extend)),
+ (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Wextend128:$extend)>;
+}
+
multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
Instruction STRW, Instruction STRX> {
@@ -2001,7 +2214,7 @@ defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
// Match all store 128 bits width whose type is compatible with FPR128
-let Predicates = [IsLE] in {
+let Predicates = [IsLE, UseSTRQro] in {
// We must use ST1 to store vectors in big-endian.
defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
@@ -2042,11 +2255,11 @@ let AddedComplexity = 19 in {
//---
// (unsigned immediate)
-defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
- [(store GPR64:$Rt,
+defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str",
+ [(store GPR64z:$Rt,
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
-defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
- [(store GPR32:$Rt,
+defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
+ [(store GPR32z:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
[(store FPR8:$Rt,
@@ -2062,12 +2275,12 @@ defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
-defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
- [(truncstorei16 GPR32:$Rt,
+defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh",
+ [(truncstorei16 GPR32z:$Rt,
(am_indexed16 GPR64sp:$Rn,
uimm12s2:$offset))]>;
-defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb",
- [(truncstorei8 GPR32:$Rt,
+defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
+ [(truncstorei8 GPR32z:$Rt,
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
@@ -2444,13 +2657,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
-defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+
+multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
+ def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
+ def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
+ def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
+ def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
+ def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
+ def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+
+ def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
}
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
+
multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
def : Pat<(i32 (to_int (round f32:$Rn))),
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
@@ -2468,8 +2700,8 @@ defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
-defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">;
-defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">;
+defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
//===----------------------------------------------------------------------===//
// Scaled integer to floating point conversion instructions.
@@ -2485,14 +2717,19 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
defm FMOV : UnscaledConversion<"fmov">;
// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
-let isReMaterializable = 1, isCodeGenOnly = 1 in {
+let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
+def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>,
+ Sched<[WriteF]>;
def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
- PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>,
- Requires<[NoZCZ]>;
+ Sched<[WriteF]>;
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
- PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>,
- Requires<[NoZCZ]>;
+ Sched<[WriteF]>;
}
+// Similarly add aliases
+def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>,
+ Requires<[HasFullFP16]>;
+def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>;
+def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>;
//===----------------------------------------------------------------------===//
// Floating point conversion instruction.
@@ -2507,7 +2744,7 @@ defm FCVT : FPConversion<"fcvt">;
defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
defm FMOV : SingleOperandFPData<0b0000, "fmov">;
defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
-defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
@@ -2617,6 +2854,7 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd),
(i32 imm:$cond), NZCV))]> {
let Uses = [NZCV];
let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
}
@@ -2635,60 +2873,36 @@ defm FMOV : FPMoveImmediate<"fmov">;
defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
int_aarch64_neon_uabd>;
// Match UABDL in log2-shuffle patterns.
+def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
+ (zext (v8i8 V64:$opB))))),
+ (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
+ (zext (extract_high_v16i8 V128:$opB))))),
+ (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
(zext (extract_high_v16i8 V128:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
-def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
- (v4i32 (add (sub (zext (v4i16 V64:$opA)),
- (zext (v4i16 V64:$opB))),
- (AArch64vashr v4i32:$src, (i32 31))))),
+def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
+ (zext (v4i16 V64:$opB))))),
(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
-def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
- (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
- (zext (extract_high_v8i16 V128:$opB))),
- (AArch64vashr v4i32:$src, (i32 31))))),
+def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
+ (zext (extract_high_v8i16 V128:$opB))))),
(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
-def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
- (v2i64 (add (sub (zext (v2i32 V64:$opA)),
- (zext (v2i32 V64:$opB))),
- (AArch64vashr v2i64:$src, (i32 63))))),
+def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
+ (zext (v2i32 V64:$opB))))),
(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
-def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
- (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
- (zext (extract_high_v4i32 V128:$opB))),
- (AArch64vashr v2i64:$src, (i32 63))))),
+def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
+ (zext (extract_high_v4i32 V128:$opB))))),
(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
-defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
-def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
- (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
- (ABSv8i8 V64:$src)>;
-def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
- (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
- (ABSv4i16 V64:$src)>;
-def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
- (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
- (ABSv2i32 V64:$src)>;
-def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
- (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
- (ABSv16i8 V128:$src)>;
-def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
- (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
- (ABSv8i16 V128:$src)>;
-def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
- (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
- (ABSv4i32 V128:$src)>;
-def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
- (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
- (ABSv2i64 V128:$src)>;
-
+defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
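
The deleted patterns matched the classic branch-free idiom by hand: with s = x >> (bits-1) (arithmetic shift), abs(x) = (x + s) ^ s. Now that the generic abs node exists, the single pattern above covers all of those shapes.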
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
@@ -2712,13 +2926,13 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
(i64 4)))),
(FCVTLv8i16 V128:$Rn)>;
-def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
(i64 2))))),
(FCVTLv4i32 V128:$Rn)>;
-def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
-def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
(i64 4))))),
(FCVTLv8i16 V128:$Rn)>;
@@ -2732,9 +2946,9 @@ def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
def : Pat<(concat_vectors V64:$Rd,
(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
-def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
-def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
+def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
@@ -2742,15 +2956,22 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
int_aarch64_neon_fcvtxn>;
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
- int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
- int_aarch64_neon_fcvtzu>;
-}
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
+
defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
-defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
@@ -3192,7 +3413,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
-let Predicates = [HasV8_1a] in {
+let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
def : Pat<(i32 (int_aarch64_neon_sqadd
@@ -3253,7 +3474,7 @@ def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
// Advanced SIMD two scalar instructions.
//===----------------------------------------------------------------------===//
-defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>;
defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
@@ -3318,6 +3539,30 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+ (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+ (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+ (FRECPEv2f64 FPR128:$Rn)>;
+
+def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+ (FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FRECPSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+ (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+ (FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+ (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
(FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3330,6 +3575,30 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+ (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+ (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+ (FRSQRTEv2f64 FPR128:$Rn)>;
+
+def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+ (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+ (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+ (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+ (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
@@ -4254,20 +4523,20 @@ def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
-def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
-def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
-def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
-def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
-def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
-def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
-def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
-def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
// AdvSIMD FMOV
def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
@@ -4319,18 +4588,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
"movi", ".2d",
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
-
-// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
-// Complexity is added to break a tie with a plain MOVI.
-let AddedComplexity = 1 in {
-def : Pat<(f32 fpimm0),
- (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
- Requires<[HasZCZ]>;
-def : Pat<(f64 fpimm0),
- (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
- Requires<[HasZCZ]>;
-}
-
def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
@@ -4845,7 +5102,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ ssub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -4898,8 +5156,9 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
-
+ dsub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
@@ -5177,6 +5436,31 @@ def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
+// for AES fusion on some CPUs.
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
+def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+ Sched<[WriteV]>;
+def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+ Sched<[WriteV]>;
+}
+
+// Only use constrained versions of AES(I)MC instructions if they are paired with
+// AESE/AESD.
+def : Pat<(v16i8 (int_aarch64_crypto_aesmc
+ (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
+ (v16i8 V128:$src2))))),
+ (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
+ (v16i8 V128:$src2)))))>,
+ Requires<[HasFuseAES]>;
+
+def : Pat<(v16i8 (int_aarch64_crypto_aesimc
+ (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
+ (v16i8 V128:$src2))))),
+ (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
+ (v16i8 V128:$src2)))))>,
+ Requires<[HasFuseAES]>;
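+// The tied pseudos keep each AESE/AESMC (and AESD/AESIMC) pair on the same
+// register, the dependency shape the fusion hardware matches, e.g.
+// "aese v0.16b, v1.16b" immediately followed by "aesmc v0.16b, v0.16b".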
+
def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
@@ -5194,15 +5478,8 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0
//----------------------------------------------------------------------------
// FIXME: Like for X86, these should go in their own separate .td file.
-// Any instruction that defines a 32-bit result leaves the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
def def32 : PatLeaf<(i32 GPR32:$src), [{
- return N->getOpcode() != ISD::TRUNCATE &&
- N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
- N->getOpcode() != ISD::CopyFromReg;
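+  // isDef32() factors out this predicate: the node must not be a TRUNCATE,
+  // EXTRACT_SUBREG or CopyFromReg (and possibly other cases), since any
+  // other 32-bit operation implicitly zero-extends to 64 bits.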
+ return isDef32(*N);
}]>;
// In the case of a 32-bit def that is known to implicitly zero-extend,
@@ -5982,7 +6259,7 @@ def : NTStore64Pat<v8i8>;
def : Pat<(nontemporalstore GPR64:$Rt,
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
- (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32),
+ (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity=10
} // Predicates = [IsLE]
@@ -5990,8 +6267,10 @@ def : Pat<(nontemporalstore GPR64:$Rt,
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
- def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
}
def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
@@ -6002,3 +6281,4 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
include "AArch64InstrAtomics.td"
+include "AArch64SVEInstrInfo.td"
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp
new file mode 100644
index 00000000000..9d4bdd9591b
--- /dev/null
+++ b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.cpp
@@ -0,0 +1,123 @@
+//===-- AArch64ReturnProtectorLowering.cpp --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the
+// ReturnProtectorLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64ReturnProtectorLowering.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+void AArch64ReturnProtectorLowering::insertReturnProtectorPrologue(
+ MachineFunction &MF, MachineBasicBlock &MBB, GlobalVariable *cookie) const {
+
+ MachineBasicBlock::instr_iterator MI = MBB.instr_begin();
+ DebugLoc MBBDL = MBB.findDebugLoc(MI);
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ unsigned REG = MF.getFrameInfo().getReturnProtectorRegister();
+
+ MBB.addLiveIn(REG);
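+  // The emitted sequence is, roughly (cookie symbol name illustrative):
+  //   adrp REG, __retguard_xx
+  //   ldr  REG, [REG, :lo12:__retguard_xx]
+  //   eor  REG, REG, lr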
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::ADRP), REG)
+ .addGlobalAddress(cookie, 0, AArch64II::MO_PAGE);
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::LDRXui), REG)
+ .addReg(REG)
+ .addGlobalAddress(cookie, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::EORXrr), REG)
+ .addReg(REG)
+ .addReg(AArch64::LR);
+}
+
+void AArch64ReturnProtectorLowering::insertReturnProtectorEpilogue(
+ MachineFunction &MF, MachineInstr &MI, GlobalVariable *cookie) const {
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc MBBDL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ unsigned REG = MF.getFrameInfo().getReturnProtectorRegister();
+
+ MBB.addLiveIn(REG);
+ MBB.addLiveIn(AArch64::X9);
+  // REG holds the cookie we calculated in the prologue. We use X9 as a
+  // scratch register to reload the random data. XORing REG with LR should
+  // yield the random data again. Compare REG with X9 to check.
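+  // The emitted check is, roughly:
+  //   eor  REG, REG, lr              ; recovers the cookie if LR is intact
+  //   adrp x9, <cookie page>
+  //   ldr  x9, [x9, :lo12:<cookie>]
+  //   subs REG, REG, x9              ; zero iff the cookie matches
+  // followed by the RETGUARD_JMP_TRAP pseudo, which presumably branches to
+  // a trap when REG is non-zero.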
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::EORXrr), REG)
+ .addReg(REG)
+ .addReg(AArch64::LR);
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::ADRP), AArch64::X9)
+ .addGlobalAddress(cookie, 0, AArch64II::MO_PAGE);
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::LDRXui), AArch64::X9)
+ .addReg(AArch64::X9)
+ .addGlobalAddress(cookie, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::SUBSXrr), REG)
+ .addReg(REG)
+ .addReg(AArch64::X9);
+ BuildMI(MBB, MI, MBBDL, TII->get(AArch64::RETGUARD_JMP_TRAP)).addReg(REG);
+}
+
+bool AArch64ReturnProtectorLowering::opcodeIsReturn(unsigned opcode) const {
+ switch (opcode) {
+ case AArch64::RET:
+ case AArch64::RET_ReallyLR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void AArch64ReturnProtectorLowering::fillTempRegisters(
+ MachineFunction &MF, std::vector<unsigned> &TempRegs) const {
+
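+  // These are caller-saved temporaries; x16/x17 (the IP0/IP1 linker scratch
+  // registers) and x18 (the platform register) are presumably avoided since
+  // they may be clobbered or reserved.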
+ TempRegs.push_back(AArch64::X15);
+ TempRegs.push_back(AArch64::X14);
+ TempRegs.push_back(AArch64::X13);
+ TempRegs.push_back(AArch64::X12);
+ TempRegs.push_back(AArch64::X11);
+ TempRegs.push_back(AArch64::X10);
+}
+
+void AArch64ReturnProtectorLowering::saveReturnProtectorRegister(
+ const MachineFunction &MF, std::vector<CalleeSavedInfo> &CSI) const {
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.getReturnProtectorNeeded())
+ return;
+
+ if (!MFI.hasReturnProtectorRegister())
+ llvm_unreachable("Saving unset return protector register");
+
+ // Put the temp reg after FP and LR to avoid layout issues
+ // with the D registers later.
+ bool added = false;
+ for (auto CSRI = CSI.begin(); CSRI != CSI.end(); CSRI++) {
+ if (CSRI->getReg() != AArch64::FP && CSRI->getReg() != AArch64::LR) {
+ CSI.insert(CSRI, CalleeSavedInfo(MFI.getReturnProtectorRegister()));
+ added = true;
+ break;
+ }
+ }
+ if (!added)
+ CSI.push_back(CalleeSavedInfo(MFI.getReturnProtectorRegister()));
+}
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h
new file mode 100644
index 00000000000..47feb002978
--- /dev/null
+++ b/gnu/llvm/lib/Target/AArch64/AArch64ReturnProtectorLowering.h
@@ -0,0 +1,52 @@
+//===-- AArch64ReturnProtectorLowering.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the
+// ReturnProtectorLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64RETURNPROTECTORLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64RETURNPROTECTORLOWERING_H
+
+#include "llvm/CodeGen/ReturnProtectorLowering.h"
+
+namespace llvm {
+
+class AArch64ReturnProtectorLowering : public ReturnProtectorLowering {
+public:
+ /// insertReturnProtectorPrologue/Epilogue - insert return protector
+ /// instrumentation in prologue or epilogue.
+ virtual void
+ insertReturnProtectorPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
+ GlobalVariable *cookie) const override;
+ virtual void
+ insertReturnProtectorEpilogue(MachineFunction &MF, MachineInstr &MI,
+ GlobalVariable *cookie) const override;
+
+  /// opcodeIsReturn - Return true if the given opcode is a return
+  /// instruction needing return protection, false otherwise.
+ virtual bool opcodeIsReturn(unsigned opcode) const override;
+
+ /// fillTempRegisters - Fill the list of available temp registers we can
+ /// use as a return protector register.
+ virtual void
+ fillTempRegisters(MachineFunction &MF,
+ std::vector<unsigned> &TempRegs) const override;
+
+  /// saveReturnProtectorRegister - Allows the target to save the return
+  /// protector calculation register in the CalleeSavedInfo vector if needed.
+ virtual void
+ saveReturnProtectorRegister(const MachineFunction &MF,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp b/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp
index 91a656f4a48..5c3953bd478 100644
--- a/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/gnu/llvm/tools/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3979,7 +3979,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
RetProtector = 1;
}
if (RetProtector &&
- (getToolChain().getArch() == llvm::Triple::x86_64) &&
+ ((getToolChain().getArch() == llvm::Triple::x86_64) ||
+ (getToolChain().getArch() == llvm::Triple::aarch64)) &&
!Args.hasArg(options::OPT_fno_stack_protector) &&
!Args.hasArg(options::OPT_pg)) {
CmdArgs.push_back(Args.MakeArgString("-D_RET_PROTECTOR"));
diff --git a/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile b/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile
index 059c87c027d..c7333b25152 100644
--- a/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile
+++ b/gnu/usr.bin/clang/libLLVMAArch64CodeGen/Makefile
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile,v 1.6 2018/04/06 14:44:04 patrick Exp $
+# $OpenBSD: Makefile,v 1.7 2018/08/12 17:07:00 mortimer Exp $
LIB= LLVMAArch64CodeGen
NOPIC=
@@ -36,6 +36,7 @@ SRCS= AArch64A57FPLoadBalancing.cpp \
AArch64RedundantCopyElimination.cpp \
AArch64RegisterBankInfo.cpp \
AArch64RegisterInfo.cpp \
+ AArch64ReturnProtectorLowering.cpp \
AArch64SIMDInstrOpt.cpp \
AArch64SelectionDAGInfo.cpp \
AArch64StorePairSuppress.cpp \