author    | Patrick Wildt <patrick@cvs.openbsd.org> | 2017-01-14 20:19:40 +0000
committer | Patrick Wildt <patrick@cvs.openbsd.org> | 2017-01-14 20:19:40 +0000
commit    | f444bdcbea9aa072c5d8199d4a7b1288a44b40ac (patch)
tree      | 6f60ced3b255dbfe9ef5abcc6f8ab372457023a9 /gnu/llvm
parent    | 0bd56e6caa13367b1386362329946770bc35feff (diff)
Disable the Load Stack Guard for OpenBSD on AArch64. We don't use it
on any other platform and it causes a segfault in combination with our
IR Stack Guard.
"looks reasonable" kettenis@
"looks good to me" stefan@
Diffstat (limited to 'gnu/llvm')
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 981
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h      | 123
2 files changed, 697 insertions, 407 deletions
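Most of the churn in AArch64ISelLowering.cpp below is surrounding upstream work rather than the stack-guard change itself: CPU-model checks such as isCyclone() and isCortexA57() are replaced by tuning accessors on AArch64Subtarget (isMisaligned128StoreSlow(), predictableSelectIsExpensive(), getPrefFunctionAlignment(), ...), which is also why AArch64Subtarget.h changes. Below is a minimal sketch of that pattern, with simplified member names and initialisation rather than the real class layout:

```cpp
// Illustration of the subtarget-tuning pattern visible in the diff: per-CPU
// properties are stored as feature fields on the subtarget, and the lowering
// code queries an accessor instead of testing the CPU model directly.
// Names and defaults are simplified for the sketch.
class AArch64SubtargetSketch {
  bool Misaligned128StoreIsSlow = false;      // e.g. set for Cyclone
  bool PredictableSelectIsExpensive = false;  // e.g. set for Cortex-A57
  unsigned PrefFunctionAlignment = 0;

public:
  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
  bool predictableSelectIsExpensive() const {
    return PredictableSelectIsExpensive;
  }
  unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
};
```

The displayed diff therefore includes this upstream refactoring as context, not just the one small functional change the log message describes.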
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 92cf1cd7197..13722c37c57 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,12 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -// Place holder until extr generation is tested fully. -static cl::opt<bool> -EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, - cl::desc("Allow AArch64 (or (shift)(shift))->extract"), - cl::init(true)); - static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), @@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +// Disabled for causing self-hosting failures once returned-attribute inference +// was enabled. +static cl::opt<bool> +EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden, + cl::desc("Directly forward this return"), + cl::init(false)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which AArch64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); - + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); @@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; + // Set required alignment. setMinFunctionAlignment(2); + // Set preferred alignments. 
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); + setPrefLoopAlignment(STI.getPrefLoopAlignment()); setHasExtractBitsInsn(true); @@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); + + setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); + // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. @@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // Prefer likely predicted branches to selects on out-of-order cores. - if (Subtarget->isCortexA57()) - PredictableSelectIsExpensive = true; + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } -void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { +void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i32); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i32); } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i64); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i64); } // Mark vector float intrinsics as expand. 
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); // But we do support custom-lowering for FCOPYSIGN. - setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes. 
if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); // [SU][MIN|MAX] are available for all NEON types apart from i64. - if (!VT.isFloatingPoint() && - VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) + if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); } } } @@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (Subtarget->requiresStrictAlign()) return false; - // FIXME: This is mostly true for Cyclone, but not necessarily others. if (Fast) { - // FIXME: Define an attribute for slow unaligned accesses instead of - // relying on the CPU type as a proxy. - // On Cyclone, unaligned 128-bit stores are slow. - *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. 
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; } return nullptr; } MachineBasicBlock * -AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, +AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: @@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned IfTrueReg = MI.getOperand(1).getReg(); + unsigned IfFalseReg = MI.getOperand(2).getReg(); + unsigned CondCode = MI.getOperand(3).getImm(); + bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, .addReg(IfFalseReg) .addMBB(MBB); - MI->eraseFromParent(); + MI.eraseFromParent(); return EndBB; } -MachineBasicBlock * -AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { +MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { default: #ifndef NDEBUG - MI->dump(); + MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); @@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC, } } +/// Convert a DAG fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. +static void changeFPCCToANDAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case ISD::SETONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case ISD::SETUEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. 
Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations @@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) { } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); - if (VT.isFloatingPoint()) + if (VT.isFloatingPoint()) { + assert(VT != MVT::f128); + if (VT == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + VT = MVT::f32; + } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. @@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, - SDValue Condition, unsigned NZCV, - SDLoc DL, SelectionDAG &DAG) { + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; - if (LHS.getValueType().isFloatingPoint()) + if (LHS.getValueType().isFloatingPoint()) { + assert(LHS.getValueType() != MVT::f128); + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); + } Opcode = AArch64ISD::FCCMP; - else if (RHS.getOpcode() == ISD::SUB) { + } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // See emitComparison() on why we can only do this for SETEQ and SETNE. - Opcode = AArch64ISD::CCMN; - RHS = RHS.getOperand(1); - } + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; + SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } @@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be /// brought into such a form. -static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { - CanPushNegate = true; + if (Val->getOperand(0).getValueType() == MVT::f128) + return false; + CanNegate = true; return true; } - // Protect against stack overflow. - if (Depth > 15) + // Protect against exponential runtime and stack overflow. 
+ if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + bool CanNegateL; + if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1)) return false; - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + bool CanNegateR; + if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1)) return false; - // We cannot push a negate through an AND operation (it would become an OR), - // we can however change a (not (or x y)) to (and (not x) (not y)) if we can - // push the negate through the x/y subtrees. - CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + + if (Opcode == ISD::OR) { + // For an OR expression we need to be able to negate at least one side or + // we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // We can however change a (not (or x y)) to (and (not x) (not y)) if we + // can negate the x and y subtrees. + CanNegate = CanNegateL && CanNegateR; + } else { + // If the operands are OR expressions then we finally need to negate their + // outputs, we can only do that for the operand with emitted last by + // negating OutCC, not for both operands. + bool NeedsNegOutL = O0->getOpcode() == ISD::OR; + bool NeedsNegOutR = O1->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return false; + // We cannot negate an AND operation (it would become an OR), + CanNegate = false; + } return true; } return false; @@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate /// for the comparisons in the current subtree; @p Depth limits the search /// depth to avoid stack overflow. -static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, - AArch64CC::CondCode &OutCC, bool PushNegate = false, - SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, - unsigned Depth = 0) { +static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, + AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { @@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); - if (PushNegate) + if (Negate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. @@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; - changeFPCCToAArch64CC(CC, OutCC, ExtraCC); - // Surpisingly some floating point conditions can't be tested with a - // single condition code. Construct an additional comparison in this case. - // See comment below on how we deal with OR conditions. + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. 
if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); - else { - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - // Note that we want the inverse of ExtraCC, so NZCV is not inversed. - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); - ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, - NZCV, DL, DAG); - } + else + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, + ExtraCC, DL, DAG); CCOp = ExtraCmp; - Predicate = AArch64CC::getInvertedCondCode(ExtraCC); - OutCC = AArch64CC::getInvertedCondCode(OutCC); + Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain - if (!CCOp.getNode()) + if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); - return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); - } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) - return SDValue(); - - assert((Opcode == ISD::OR || !PushNegate) - && "Can only push negate through OR operation"); + } + assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && + "Valid conjunction/disjunction tree"); // Check if both sides can be transformed. SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) - return SDValue(); - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) - return SDValue(); - // Do we need to negate our operands? - bool NegateOperands = Opcode == ISD::OR; + // In case of an OR we need to negate our operands and the result. + // (A v B) <=> not(not(A) ^ not(B)) + bool NegateOpsAndResult = Opcode == ISD::OR; // We can negate the results of all previous operations by inverting the - // predicate flags giving us a free negation for one side. For the other side - // we need to be able to push the negation to the leafs of the tree. - if (NegateOperands) { - if (!CanPushNegateL && !CanPushNegateR) - return SDValue(); - // Order the side where we can push the negate through to LHS. - if (!CanPushNegateL && CanPushNegateR) + // predicate flags giving us a free negation for one side. The other side + // must be negatable by itself. + if (NegateOpsAndResult) { + // See which side we can negate. + bool CanNegateL; + bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); + assert(isValidL && "Valid conjunction/disjunction tree"); + (void)isValidL; + +#ifndef NDEBUG + bool CanNegateR; + bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); + assert(isValidR && "Valid conjunction/disjunction tree"); + assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); +#endif + + // Order the side which we cannot negate to RHS so we can emit it first. 
+ if (!CanNegateL) std::swap(LHS, RHS); } else { bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; - bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; - if (NeedsNegOutL && NeedsNegOutR) - return SDValue(); + assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && + "Valid conjunction/disjunction tree"); // Order the side where we need to negate the output flags to RHS so it // gets emitted first. if (NeedsNegOutL) @@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, // through if we are already in a PushNegate case, otherwise we can negate // the "flags to test" afterwards. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, - CCOp, Predicate, Depth+1); - if (NegateOperands && !PushNegate) + SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, + CCOp, Predicate); + if (NegateOpsAndResult && !Negate) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - // Emit LHS. We must push the negate through if we need to negate it. - SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, - CmpR, RHSCC, Depth+1); + // Emit LHS. We may need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, + NegateOpsAndResult, CmpR, + RHSCC); // If we transformed an OR to and AND then we have to negate the result - // (or absorb a PushNegate resulting in a double negation). - if (Opcode == ISD::OR && !PushNegate) + // (or absorb the Negate parameter). + if (NegateOpsAndResult && !Negate) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. See @ref AArch64CCMP. +/// \see emitConjunctionDisjunctionTreeRec(). +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC) { + bool CanNegate; + if (!isConjunctionDisjunctionTree(Val, CanNegate)) + return SDValue(); + + return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), + AArch64CC::AL); +} + /// @} static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + SDValue &AArch64cc, SelectionDAG &DAG, + const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::getVectorVT(TruncVT, NumElts), Ops); + return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { @@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
- case Intrinsic::aarch64_thread_pointer: { + case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } @@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: + case CallingConv::PreserveMost: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; @@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - MemVT, false, false, false, 0); + MemVT); InVals.push_back(ArgValue); } } // varargs + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { if (!Subtarget->isTargetDarwin()) { // The AAPCS variadic function ABI is identical to the non-variadic @@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments( saveVarArgRegisters(CCInfo, DAG, DL, Chain); } - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); + StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. 
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, + SelectionDAG &DAG, + const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, - false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), - false, false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS @@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult( // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference - if (i == 0 && isThisReturn) { + if (i == 0 && isThisReturn && EnableThisRetForwarding) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); @@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult( bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { @@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; - const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; + return IsTailCallConvention(CalleeCC) && CCMatch; } // Externally-defined functions with weak linkage should not be @@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); + LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If @@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) @@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; } - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForCall(CalleeCC, isVarArg), + CCAssignFnForCall(CallerCC, isVarArg))) + return false; + // The callee has to preserve all registers the caller needs to preserve. 
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { - SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, - *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, - *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } } // Nothing more to check if the callee is taking no arguments @@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) + return false; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; + + return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, @@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, } bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; + return CallCC == CallingConv::Fast || + CallCC == CallingConv::PreserveMost; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, @@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); + NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we @@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } @@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; + uint64_t CalleePopBytes = + DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), @@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { + const SDLoc &DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; @@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } - if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "use of MO_CONSTPOOL only supported on small model"); - SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); - SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad( - PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*isVolatile=*/false, - /*isNonTemporal=*/true, - /*isInvariant=*/true, 8); - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), DL, PtrVT)); - return GlobalAddr; - } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( @@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i64, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, - true, true, 8); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 8, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. -SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, +SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, + const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 2> Ops; - Ops.push_back(Chain); - Ops.push_back(SymAddr); - - Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + Chain = + DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); @@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, - SDValue FVal, SDLoc dl, + SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. 
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, getPointerTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); + MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, @@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); + MachinePointerInfo(SV), /* Alignment = */ 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); @@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); + MachinePointerInfo(SV, 8), + /* Alignment = */ 8)); } // void *__vr_top at offset 16 @@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); + MachinePointerInfo(SV, 16), + /* Alignment = */ 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-GPRSize, DL, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, + MachinePointerInfo(SV, 24), /* Alignment = */ 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-FPRSize, DL, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, + MachinePointerInfo(SV, 28), /* Alignment = */ 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), - false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); if (Align > 8) { @@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); + SDValue APStore = + DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); + SDValue WideFP = + DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. 
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); @@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. @@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +/// getEstimate - Return the appropriate estimate DAG for either the reciprocal +/// or the reciprocal square root. +static SDValue getEstimate(const AArch64Subtarget &ST, + const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, + const SDValue &Operand, unsigned &ExtraSteps) { + if (!ST.hasNEON()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + std::string RecipOp; + RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; + RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; + RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + ExtraSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { + return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { + UseOneConst = true; + return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. +const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { + // At this point, we have to lower this constraint to something else, so we + // lower it to an "r" or "w". However, by doing this we will force the result + // to be in register, while the X constraint is much more permissive. 
+ // + // Although we are correct (we are free to emit anything, without + // constraints), we might break use cases that would expect us to be more + // efficient and emit something else. + if (!Subtarget->hasFPARMv8()) + return "r"; + + if (ConstraintVT.isFloatingPoint()) + return "w"; + + if (ConstraintVT.isVector() && + (ConstraintVT.getSizeInBits() == 64 || + ConstraintVT.getSizeInBits() == 128)) + return "w"; + + return "r"; +} /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. @@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. + // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. - Res.first = AArch64::FPR128RegClass.getRegister(RegNo); - Res.second = &AArch64::FPR128RegClass; + if (VT != MVT::Other && VT.getSizeInBits() == 64) { + Res.first = AArch64::FPR64RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR64RegClass; + } else { + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } } } } @@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(V.getOperand(1))) { // A shuffle can only come from building a vector from various - // elements of other vectors. + // elements of other vectors, provided their indices are constant. return SDValue(); } @@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) + if (Entry.isUndef()) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); @@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], &Mask[0]); + ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { + const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); @@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); + // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], + // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), - V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, + V2Cst, DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. 
if (Lane == -1) @@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { + } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); @@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) + if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; @@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } @@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op, } Ops.push_back(Lane); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, @@ -6217,7 +6311,7 @@ FailedModImm: SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; @@ -6273,7 +6367,7 @@ FailedModImm: for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -6328,7 +6422,7 @@ FailedModImm: // value is already in an S or D register. // Do not do this for UNDEF/LOAD nodes because we have better patterns // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = @@ -6339,7 +6433,7 @@ FailedModImm: } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); @@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); @@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); - if (isFMAFasterThanFMulAndFAdd(VT) && - isOperationLegalOrCustom(ISD::FMA, VT) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) - return false; - - return true; + return !(isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; + // Avoid UB for INT64_MIN. + if (Immed == std::numeric_limits<int64_t>::min()) + return false; + // Same encoding for add/sub, just flip the sign. + Immed = std::abs(Immed); + return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; return isLegalAddImmediate(Immed); } @@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; + return !AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, @@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return Shift < 3; } +/// Turn vector tests of the signbit in the form of: +/// xor (sra X, elt_size(X)-1), -1 +/// into: +/// cmge X, X, #0 +static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!Subtarget->hasNEON() || !VT.isVector()) + return SDValue(); + + // There must be a shift right algebraic before the xor, and the xor must be a + // 'not' operation. + SDValue Shift = N->getOperand(0); + SDValue Ones = N->getOperand(1); + if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || + !ISD::isBuildVectorAllOnes(Ones.getNode())) + return SDValue(); + + // The shift should be smearing the sign bit across each vector element. 
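The foldVectorXorShiftIntoCmp combine introduced above rests on a per-lane identity: an arithmetic shift right by (element size - 1) smears the sign bit across the element, so the bitwise NOT of that smear is all-ones exactly when the element is non-negative, which is what CMGE against zero produces. A minimal standalone check of the 32-bit case (plain C++, not SelectionDAG code; it relies on arithmetic right shift of signed values, which C++20 guarantees and AArch64 compilers implement):

#include <cassert>
#include <cstdint>

// Per 32-bit lane: ~(x >> 31) == (x >= 0 ? 0xFFFFFFFF : 0),
// i.e. xor(sra(x, 31), all-ones) is exactly the cmge-with-zero lane result.
uint32_t notOfSignSmear(int32_t x) { return ~static_cast<uint32_t>(x >> 31); }
uint32_t cmgez(int32_t x) { return x >= 0 ? 0xFFFFFFFFu : 0u; }

int main() {
  for (int32_t x : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX})
    assert(notOfSignSmear(x) == cmgez(x));
  return 0;
}

The one-use, VASHR and build-vector-all-ones checks in the combine simply confirm that the xor really is a 'not' of that particular shift before the node is rewritten.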
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); + if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + return SDValue(); + + return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); +} + // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// performXorCombine - Attempts to handle integer ABS. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) + return Cmp; + return performIntegerAbsCombine(N, DAG); } @@ -7376,6 +7496,10 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || @@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); + const APInt &Value = C->getAPIntValue(); EVT VT = N->getValueType(0); SDLoc DL(N); if (Value.isNonNegative()) { @@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); + LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. @@ -7562,12 +7685,14 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -7604,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + + assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && + "Illegal vector type after legalization"); + SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs @@ -7801,25 +7932,49 @@ static SDValue tryCombineToBSL(SDNode *N, static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableAArch64ExtrGeneration) - return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } +static SDValue performSRLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the + // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) + // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() == ISD::BSWAP) { + SDLoc DL(N); + SDValue N1 = N->getOperand(1); + SDValue N00 = N0.getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + uint64_t ShiftAmt = C->getZExtValue(); + if (VT == MVT::i32 && ShiftAmt == 16 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + if (VT == MVT::i64 && ShiftAmt == 32 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + } + } + return SDValue(); +} + static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -8575,15 +8730,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + St->getAlignment(), St->getMemOperand()->getFlags()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); + St->getPointerInfo(), Alignment, + St->getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; @@ -8603,9 +8758,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundaries. We want to split such stores. - if (!Subtarget->isCyclone()) + if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. 
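performSRLCombine above turns (srl (bswap x), 16) into (rotr (bswap x), 16) when the high 16 bits of x are known zero, and likewise the 64-bit case with a shift of 32. The two are equal in that situation because the bits the rotate would wrap around are all zero, and the rotate form is presumably easier for later patterns to match as a REV-style instruction. The identity itself is easy to check in plain C++ (bswap32/rotr32 are local helpers; std::byteswap and std::rotr exist in newer standard libraries but are avoided here to keep the sketch self-contained):

#include <cassert>
#include <cstdint>

uint32_t bswap32(uint32_t x) {
  return (x >> 24) | ((x >> 8) & 0xFF00u) | ((x << 8) & 0xFF0000u) | (x << 24);
}
// Rotate right; r must be in (0, 32) to avoid an undefined 32-bit shift.
uint32_t rotr32(uint32_t x, unsigned r) { return (x >> r) | (x << (32 - r)); }

int main() {
  // With the high 16 bits of x zero, the low 16 bits of bswap32(x) are zero
  // too, so shifting right by 16 and rotating right by 16 give the same value.
  for (uint32_t x : {0x0000ABCDu, 0x00000001u, 0x0000FFFFu, 0x00000000u})
    assert((bswap32(x) >> 16) == rotr32(bswap32(x), 16));
  return 0;
}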
@@ -8647,12 +8800,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + S->getAlignment(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); + S->getPointerInfo(), S->getAlignment(), + S->getMemOperand()->getFlags()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and @@ -8741,9 +8894,10 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - SmallVector<SDValue, 2> NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + SDValue NewResults[] = { + SDValue(LD, 0), // The result of load + SDValue(UpdN.getNode(), 2) // Chain + }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register @@ -8774,8 +8928,7 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - SDValue Split = split16BStores(N, DCI, DAG, Subtarget); - if (Split.getNode()) + if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && @@ -9215,10 +9368,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { } case ISD::Constant: case ISD::TargetConstant: { - if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < - 1LL << (width - 1)) - return true; - return false; + return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < + 1LL << (width - 1); } } @@ -9286,14 +9437,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. -static -bool isEquivalentMaskless(unsigned CC, unsigned width, - ISD::LoadExtType ExtType, signed AddConstant, - signed CompConstant) { +static bool isEquivalentMaskless(unsigned CC, unsigned width, + ISD::LoadExtType ExtType, int AddConstant, + int CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. - signed MaxUInt = (1 << width); + int MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer @@ -9441,8 +9591,7 @@ SDValue performCONDCombine(SDNode *N, static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); - if (NV.getNode()) + if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); @@ -9678,7 +9827,7 @@ static SDValue performSelectCombine(SDNode *N, // Now duplicate the comparison mask we want across all other lanes. 
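The checkValueWidth change above is a pure simplification (return the condition instead of branching to return true or false), but the condition is worth spelling out: for a constant operand it asks whether the value's magnitude fits below 2^(width-1). A small model of just that constant case (fitsInWidth is my name for it, not an LLVM API):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Rough model of the Constant/TargetConstant case in checkValueWidth: the
// value is considered narrow enough when |v| < 2^(width-1).
bool fitsInWidth(int64_t v, unsigned width) {
  return std::abs(v) < (1LL << (width - 1));
}

int main() {
  assert(fitsInWidth(127, 8));     // inside the 8-bit signed range
  assert(!fitsInWidth(128, 8));    // one past it
  assert(fitsInWidth(-127, 8));    // note that -128 is rejected by this check
  assert(!fitsInWidth(40000, 16)); // outside the 16-bit signed range
  return 0;
}

The isEquivalentMaskless change alongside it is likewise cosmetic: the 'signed' spellings become plain int with no change in behaviour.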
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); @@ -9711,11 +9860,13 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - return performFpToIntCombine(N, DAG, Subtarget); + return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); + case ISD::SRL: + return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -9829,10 +9980,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; + return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, @@ -9935,6 +10083,38 @@ static void ReplaceReductionResults(SDNode *N, Results.push_back(SplitVal); } +static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); + SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, + DAG.getNode(ISD::SRL, DL, MVT::i128, N, + DAG.getConstant(64, DL, MVT::i64))); + return std::make_pair(Lo, Hi); +} + +static void ReplaceCMP_SWAP_128Results(SDNode *N, + SmallVectorImpl<SDValue> & Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i128 && + "AtomicCmpSwap on types less than 128 should be legal"); + auto Desired = splitInt128(N->getOperand(2), DAG); + auto New = splitInt128(N->getOperand(3), DAG); + SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, + New.first, New.second, N->getOperand(0)}; + SDNode *CmpSwap = DAG.getMachineNode( + AArch64::CMP_SWAP_128, SDLoc(N), + DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + Results.push_back(SDValue(CmpSwap, 0)); + Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(SDValue(CmpSwap, 3)); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9966,11 +10146,16 @@ void AArch64TargetLowering::ReplaceNodeResults( assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. 
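ReplaceCMP_SWAP_128Results feeds the CMP_SWAP_128 pseudo with the desired and new values split into 64-bit halves, and splitInt128 above is simply "truncate for the low half, shift right by 64 then truncate for the high half". The same split at the value level, in plain C++ (assuming a compiler with the __int128 extension, as GCC and Clang provide on 64-bit targets; splitU128 is my name):

#include <cassert>
#include <cstdint>
#include <utility>

// Value-level mirror of splitInt128: low 64 bits by truncation, high 64 bits
// by a shift right of 64 followed by truncation.
std::pair<uint64_t, uint64_t> splitU128(unsigned __int128 v) {
  const uint64_t lo = static_cast<uint64_t>(v);
  const uint64_t hi = static_cast<uint64_t>(v >> 64);
  return {lo, hi};
}

int main() {
  const unsigned __int128 v =
      (static_cast<unsigned __int128>(0x0123456789ABCDEFull) << 64) |
      0xFEDCBA9876543210ull;
  const auto [lo, hi] = splitU128(v);
  assert(lo == 0xFEDCBA9876543210ull);
  assert(hi == 0x0123456789ABCDEFull);
  return 0;
}

Results then carries the two i64 halves plus the chain taken from the machine node, which is what the i128 expansion of the original ATOMIC_CMP_SWAP result expects.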
return; + case ISD::ATOMIC_CMP_SWAP: + ReplaceCMP_SWAP_128Results(N, Results, DAG); + return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { - return true; + if (!Subtarget->isTargetAndroid() && !Subtarget->isTargetOpenBSD()) + return true; + return TargetLowering::useLoadStackGuardNode(); } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10017,14 +10202,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { - return true; + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement cmpxchg without spilling. If the address being exchanged is also + // on the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. + return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); - bool IsAcquire = isAtLeastAcquire(Ord); + bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a @@ -10066,7 +10256,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = isAtLeastRelease(Ord); + bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form @@ -10104,6 +10294,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getIRStackGuard(IRB); + + // Android provides a fixed TLS slot for the stack cookie. 
See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x28; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); @@ -10114,7 +10320,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons const unsigned TlsOffset = 0x48; Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); @@ -10166,3 +10372,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on AArch64 is expensive. However, when aggressively + // optimizing for code size, we prefer to use a div instruction, as it is + // usually smaller than the alternative sequence. + // The exception to this is vector division. Since AArch64 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. + bool OptSize = + Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h b/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h index 151133b2f32..91905b2c383 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -32,38 +33,64 @@ class StringRef; class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum { +public: + enum ARMProcFamilyEnum : uint8_t { Others, CortexA35, CortexA53, CortexA57, + CortexA72, + CortexA73, Cyclone, - ExynosM1 + ExynosM1, + Kryo, + Vulcan }; +protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; + ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_1aOps; - bool HasV8_2aOps; + bool HasV8_1aOps = false; + bool HasV8_2aOps = false; - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - bool HasPerfMon; - bool HasFullFP16; - bool HasSPE; + bool HasFPARMv8 = false; + bool HasNEON = false; + bool HasCrypto = false; + bool HasCRC = false; + bool HasRAS = false; + bool HasPerfMon = false; + bool HasFullFP16 = false; + bool HasSPE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
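The new isIntDivCheap hook above (minsize functions keep the scalar divide, everything else gets the expansion) is what the BuildSDIVPow2 change consults before bailing out early. For reference, this is roughly the kind of branch-free sequence a signed divide by a power of two is otherwise turned into: bias a negative dividend by 2^k - 1 so the arithmetic shift rounds toward zero the way C division does. sdivPow2 below is an illustration of that idea, not the exact AArch64 cmp/csel/asr sequence:

#include <cassert>
#include <cstdint>

int32_t sdivPow2(int32_t x, unsigned k) {
  const uint32_t sign = static_cast<uint32_t>(x >> 31);              // all ones if x < 0
  const int32_t biased = x + static_cast<int32_t>(sign >> (32 - k)); // + (2^k - 1) when negative
  return biased >> k;                                                // arithmetic shift
}

int main() {
  for (int32_t x : {-9, -8, -7, -1, 0, 1, 7, 8, 9})
    assert(sdivPow2(x, 2) == x / 4);
  return 0;
}

At -Oz that multi-instruction sequence is usually larger than a single sdiv, which is the size argument made in the isIntDivCheap comment; vector types are excluded because, with no vector divide available, the scalarized division would lose on size as well.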
- bool HasZeroCycleRegMove; + bool HasZeroCycleRegMove = false; // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; + bool HasZeroCycleZeroing = false; // StrictAlign - Disallow unaligned memory accesses. - bool StrictAlign; + bool StrictAlign = false; + bool MergeNarrowLoads = false; + bool UseAA = false; + bool PredictableSelectIsExpensive = false; + bool BalanceFPOps = false; + bool CustomAsCheapAsMove = false; + bool UsePostRAScheduler = false; + bool Misaligned128StoreIsSlow = false; + bool AvoidQuadLdStPairs = false; + bool UseAlternateSExtLoadCVTF32Pattern = false; + bool HasMacroOpFusion = false; + bool DisableLatencySchedHeuristic = false; + bool UseRSqrt = false; + uint8_t MaxInterleaveFactor = 2; + uint8_t VectorInsertExtractBaseCost = 3; + uint16_t CacheLineSize = 0; + uint16_t PrefetchDistance = 0; + uint16_t MinPrefetchStride = 1; + unsigned MaxPrefetchIterationsAhead = UINT_MAX; + unsigned PrefFunctionAlignment = 0; + unsigned PrefLoopAlignment = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -80,12 +107,20 @@ protected: AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; AArch64TargetLowering TLInfo; + /// Gather the accessor points to GlobalISel-related APIs. + /// This is used to avoid ifndefs spreading around while GISel is + /// an optional library. + std::unique_ptr<GISelAccessor> GISel; + private: /// initializeSubtargetDependencies - Initializes using CPUString and the /// passed in feature string so that we can use initializer lists for /// subtarget initialization. AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); + /// Initialize properties based on the selected processor family. + void initializeProperties(); + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -93,6 +128,11 @@ public: const std::string &FS, const TargetMachine &TM, bool LittleEndian); + /// This object will take onwership of \p GISelAccessor. + void setGISelAccessor(GISelAccessor &GISel) { + this->GISel.reset(&GISel); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -106,10 +146,20 @@ public: const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isGeneric() || isCortexA53() || isCortexA57(); + return UsePostRAScheduler; + } + + /// Returns ARM processor family. + /// Avoid this function! CPU specifics should be kept local to this class + /// and preferably modeled with SubtargetFeatures or properties in + /// initializeProperties(). 
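The subtarget header now gives every tuning knob an in-class default and, per the comment above, prefers properties filled in by initializeProperties() over CPU-string predicates such as the removed isCortexA53()/isCyclone(). A stripped-down sketch of that pattern (names and the per-family settings are illustrative, inferred only from the old string-based checks visible in this diff, not a copy of the real initializeProperties()):

#include <cstdint>

enum class ProcFamily : uint8_t { Others, CortexA53, CortexA57, Cyclone };

// Members carry defaults via in-class initializers; one routine overrides
// them per processor family, and callers query the property instead of
// comparing CPU strings.
struct SubtargetProps {
  ProcFamily Family = ProcFamily::Others;
  bool UsePostRAScheduler = false;       // was gated on isCortexA53()/isCortexA57() etc.
  bool Misaligned128StoreIsSlow = false; // was isCyclone() in split16BStores
  uint8_t MaxInterleaveFactor = 2;
  uint16_t CacheLineSize = 0;

  void initializeProperties() {
    switch (Family) {
    case ProcFamily::CortexA53:
    case ProcFamily::CortexA57:
      UsePostRAScheduler = true;
      break;
    case ProcFamily::Cyclone:
      Misaligned128StoreIsSlow = true;
      break;
    case ProcFamily::Others:
      break;
    }
  }
};

int main() {
  SubtargetProps P;
  P.Family = ProcFamily::Cyclone;
  P.initializeProperties();
  return P.Misaligned128StoreIsSlow ? 0 : 1; // the split16BStores-style query
}

The payoff shows up in the .cpp changes: split16BStores asks isMisaligned128StoreSlow() and enablePostRAScheduler returns UsePostRAScheduler, so supporting a new core means setting its properties in one place rather than adding string compares throughout the backend.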
+ ARMProcFamilyEnum getProcFamily() const { + return ARMProcFamily; } bool hasV8_1aOps() const { return HasV8_1aOps; } @@ -126,6 +176,33 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasRAS() const { return HasRAS; } + bool mergeNarrowLoads() const { return MergeNarrowLoads; } + bool balanceFPOps() const { return BalanceFPOps; } + bool predictableSelectIsExpensive() const { + return PredictableSelectIsExpensive; + } + bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } + bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } + bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; } + bool useAlternateSExtLoadCVTF32Pattern() const { + return UseAlternateSExtLoadCVTF32Pattern; + } + bool hasMacroOpFusion() const { return HasMacroOpFusion; } + bool useRSqrt() const { return UseRSqrt; } + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } + unsigned getVectorInsertExtractBaseCost() const { + return VectorInsertExtractBaseCost; + } + unsigned getCacheLineSize() const { return CacheLineSize; } + unsigned getPrefetchDistance() const { return PrefetchDistance; } + unsigned getMinPrefetchStride() const { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const { + return MaxPrefetchIterationsAhead; + } + unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } + unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; @@ -139,6 +216,7 @@ public: bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetOpenBSD() const { return TargetTriple.isOSOpenBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } @@ -146,13 +224,7 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isGeneric() const { return CPUString == "generic"; } - bool isCyclone() const { return CPUString == "cyclone"; } - bool isCortexA57() const { return CPUString == "cortex-a57"; } - bool isCortexA53() const { return CPUString == "cortex-a53"; } - bool isExynosM1() const { return CPUString == "exynos-m1"; } - - bool useAA() const override { return isCortexA53(); } + bool useAA() const override { return UseAA; } /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -174,8 +246,7 @@ public: /// returns null. const char *getBZeroEntry() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; bool enableEarlyIfConversion() const override; |
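supportsAddressTopByteIgnored(), declared above, keys on TBI: AArch64 can be configured so that bits [63:56] of a virtual address are ignored by address translation, letting software stash a tag in the top byte. The snippet below only illustrates that bit layout; it never dereferences a tagged pointer, so it runs anywhere, and the helpers are hypothetical, not LLVM or libc APIs:

#include <cassert>
#include <cstdint>

constexpr uint64_t kAddrMask = 0x00FFFFFFFFFFFFFFull; // bits [55:0]

uint64_t setTag(uint64_t addr, uint8_t tag) {
  return (addr & kAddrMask) | (static_cast<uint64_t>(tag) << 56);
}
uint8_t getTag(uint64_t addr) { return static_cast<uint8_t>(addr >> 56); }
uint64_t stripTag(uint64_t addr) { return addr & kAddrMask; }

int main() {
  const uint64_t p = 0x0000007fdeadbeefull;
  const uint64_t tagged = setTag(p, 0x2A);
  assert(getTag(tagged) == 0x2A);
  assert(stripTag(tagged) == p);
  return 0;
}

Whether memory accesses through such tagged pointers actually work depends on both the CPU and the OS enabling TBI, which is the pair of conditions the hook's comment describes; the performSTORECombine change earlier consults the same hook.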