path: root/gnu/llvm
author    Patrick Wildt <patrick@cvs.openbsd.org>    2017-01-14 20:19:40 +0000
committer Patrick Wildt <patrick@cvs.openbsd.org>    2017-01-14 20:19:40 +0000
commit    f444bdcbea9aa072c5d8199d4a7b1288a44b40ac (patch)
tree      6f60ced3b255dbfe9ef5abcc6f8ab372457023a9 /gnu/llvm
parent    0bd56e6caa13367b1386362329946770bc35feff (diff)
Disable the Load Stack Guard for OpenBSD on AArch64. We don't use it
on any other platform and it causes a segfault in combination with our
IR Stack Guard.

"looks reasonable" kettenis@, "looks good to me" stefan@
Diffstat (limited to 'gnu/llvm')
-rw-r--r--  gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 981
-rw-r--r--  gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h      | 123
2 files changed, 697 insertions, 407 deletions
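
For context on the change being described: the mechanism disabled here is SelectionDAG's LOAD_STACK_GUARD pseudo-node, which a target opts into through the useLoadStackGuardNode() hook on TargetLowering. The hunks below are dominated by an upstream merge, so the OpenBSD-specific part is not visible in them; the following is only a minimal sketch of how such a hook can be gated on the OS. useLoadStackGuardNode(), AArch64Subtarget::getTargetTriple() and Triple::isOSOpenBSD() are real LLVM APIs of this vintage, but the exact condition and placement are assumptions, not the verbatim patch.

    // Sketch only: opt AArch64 out of the LOAD_STACK_GUARD pseudo-node on
    // OpenBSD so stack-protector codegen falls back to the IR-level guard
    // OpenBSD already uses, instead of mixing two guard mechanisms.
    bool AArch64TargetLowering::useLoadStackGuardNode() const {
      if (Subtarget->getTargetTriple().isOSOpenBSD())
        return false; // rely on the IR stack guard
      return true;    // other AArch64 targets keep LOAD_STACK_GUARD
    }
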
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 92cf1cd7197..13722c37c57 100644
--- a/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -40,12 +40,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-// Place holder until extr generation is tested fully.
-static cl::opt<bool>
-EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
- cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
- cl::init(true));
-
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
@@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+// Disabled for causing self-hosting failures once returned-attribute inference
+// was enabled.
+static cl::opt<bool>
+EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden,
+ cl::desc("Directly forward this return"),
+ cl::init(false));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
@@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
- // counterparts, which AArch64 supports directly.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
-
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
@@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Also, try to fold ADD into CSINC/CSINV..
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
-
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
@@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
+ // Set required alignment.
setMinFunctionAlignment(2);
+ // Set preferred alignments.
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
+ setPrefLoopAlignment(STI.getPrefLoopAlignment());
setHasExtractBitsInsn(true);
@@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
+
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
@@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- // Prefer likely predicted branches to selects on out-of-order cores.
- if (Subtarget->isCortexA57())
- PredictableSelectIsExpensive = true;
+ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
-void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
if (VT == MVT::v2f32 || VT == MVT::v4f16) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
} else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
}
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
- setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
- setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
- }
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-
- setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
for (MVT InnerVT : MVT::all_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
- setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
- setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
- if (!VT.isFloatingPoint() &&
- VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+ if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
- setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
ISD::FMINNUM, ISD::FMAXNUM})
- setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ setOperationAction(Opcode, VT, Legal);
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
- setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
- setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
}
}
}
@@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (Subtarget->requiresStrictAlign())
return false;
- // FIXME: This is mostly true for Cyclone, but not necessarily others.
if (Fast) {
- // FIXME: Define an attribute for slow unaligned accesses instead of
- // relying on the CPU type as a proxy.
- // On Cyclone, unaligned 128-bit stores are slow.
- *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
+ case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
+ case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
}
return nullptr;
}
MachineBasicBlock *
-AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
@@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned IfTrueReg = MI->getOperand(1).getReg();
- unsigned IfFalseReg = MI->getOperand(2).getReg();
- unsigned CondCode = MI->getOperand(3).getImm();
- bool NZCVKilled = MI->getOperand(4).isKill();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned IfTrueReg = MI.getOperand(1).getReg();
+ unsigned IfFalseReg = MI.getOperand(2).getReg();
+ unsigned CondCode = MI.getOperand(3).getImm();
+ bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
.addReg(IfFalseReg)
.addMBB(MBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return EndBB;
}
-MachineBasicBlock *
-AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
default:
#ifndef NDEBUG
- MI->dump();
+ MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
@@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC,
}
}
+/// Convert a DAG fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default:
+ changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+ assert(CondCode2 == AArch64CC::AL);
+ break;
+ case ISD::SETONE:
+ // (a one b)
+ // == ((a olt b) || (a ogt b))
+ // == ((a ord b) && (a une b))
+ CondCode = AArch64CC::VC;
+ CondCode2 = AArch64CC::NE;
+ break;
+ case ISD::SETUEQ:
+ // (a ueq b)
+ // == ((a uno b) || (a oeq b))
+ // == ((a ule b) && (a uge b))
+ CondCode = AArch64CC::PL;
+ CondCode2 = AArch64CC::LE;
+ break;
+ }
+}
+
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
@@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) {
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) {
+ const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
- if (VT.isFloatingPoint())
+ if (VT.isFloatingPoint()) {
+ assert(VT != MVT::f128);
+ if (VT == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ VT = MVT::f32;
+ }
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+ }
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
@@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
- SDValue Condition, unsigned NZCV,
- SDLoc DL, SelectionDAG &DAG) {
+ AArch64CC::CondCode Predicate,
+ AArch64CC::CondCode OutCC,
+ const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
- if (LHS.getValueType().isFloatingPoint())
+ if (LHS.getValueType().isFloatingPoint()) {
+ assert(LHS.getValueType() != MVT::f128);
+ if (LHS.getValueType() == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
+ }
Opcode = AArch64ISD::FCCMP;
- else if (RHS.getOpcode() == ISD::SUB) {
+ } else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- // See emitComparison() on why we can only do this for SETEQ and SETNE.
- Opcode = AArch64ISD::CCMN;
- RHS = RHS.getOperand(1);
- }
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
+ SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
@@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
-static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
- CanPushNegate = true;
+ if (Val->getOperand(0).getValueType() == MVT::f128)
+ return false;
+ CanNegate = true;
return true;
}
- // Protect against stack overflow.
- if (Depth > 15)
+ // Protect against exponential runtime and stack overflow.
+ if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
- bool CanPushNegateL;
- if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+ bool CanNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
return false;
- bool CanPushNegateR;
- if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+ bool CanNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
return false;
- // We cannot push a negate through an AND operation (it would become an OR),
- // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
- // push the negate through the x/y subtrees.
- CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+
+ if (Opcode == ISD::OR) {
+ // For an OR expression we need to be able to negate at least one side or
+ // we cannot do the transformation at all.
+ if (!CanNegateL && !CanNegateR)
+ return false;
+ // We can however change a (not (or x y)) to (and (not x) (not y)) if we
+ // can negate the x and y subtrees.
+ CanNegate = CanNegateL && CanNegateR;
+ } else {
+ // If the operands are OR expressions then we finally need to negate their
+ // outputs; we can only do that for the operand emitted last, by
+ // negating OutCC, not for both operands.
+ bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
+ bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
+ if (NeedsNegOutL && NeedsNegOutR)
+ return false;
+ // We cannot negate an AND operation (it would become an OR).
+ CanNegate = false;
+ }
return true;
}
return false;
@@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
-static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
- AArch64CC::CondCode &OutCC, bool PushNegate = false,
- SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
- unsigned Depth = 0) {
+static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
+ AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
@@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
- if (PushNegate)
+ if (Negate)
CC = getSetCCInverse(CC, isInteger);
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
@@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
- changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
- // Surpisingly some floating point conditions can't be tested with a
- // single condition code. Construct an additional comparison in this case.
- // See comment below on how we deal with OR conditions.
+ changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+ // Some floating point conditions can't be tested with a single condition
+ // code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
- else {
- SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
- // Note that we want the inverse of ExtraCC, so NZCV is not inversed.
- unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
- ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
- NZCV, DL, DAG);
- }
+ else
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
+ ExtraCC, DL, DAG);
CCOp = ExtraCmp;
- Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
- OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
- if (!CCOp.getNode())
+ if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
- SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
- AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
- unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
- return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
- } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
- return SDValue();
-
- assert((Opcode == ISD::OR || !PushNegate)
- && "Can only push negate through OR operation");
+ }
+ assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
+ "Valid conjunction/disjunction tree");
// Check if both sides can be transformed.
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
- bool CanPushNegateL;
- if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
- return SDValue();
- bool CanPushNegateR;
- if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
- return SDValue();
- // Do we need to negate our operands?
- bool NegateOperands = Opcode == ISD::OR;
+ // In case of an OR we need to negate our operands and the result.
+ // (A v B) <=> not(not(A) ^ not(B))
+ bool NegateOpsAndResult = Opcode == ISD::OR;
// We can negate the results of all previous operations by inverting the
- // predicate flags giving us a free negation for one side. For the other side
- // we need to be able to push the negation to the leafs of the tree.
- if (NegateOperands) {
- if (!CanPushNegateL && !CanPushNegateR)
- return SDValue();
- // Order the side where we can push the negate through to LHS.
- if (!CanPushNegateL && CanPushNegateR)
+ // predicate flags giving us a free negation for one side. The other side
+ // must be negatable by itself.
+ if (NegateOpsAndResult) {
+ // See which side we can negate.
+ bool CanNegateL;
+ bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+ assert(isValidL && "Valid conjunction/disjunction tree");
+ (void)isValidL;
+
+#ifndef NDEBUG
+ bool CanNegateR;
+ bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+ assert(isValidR && "Valid conjunction/disjunction tree");
+ assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
+#endif
+
+ // Order the side which we cannot negate to RHS so we can emit it first.
+ if (!CanNegateL)
std::swap(LHS, RHS);
} else {
bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
- bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
- if (NeedsNegOutL && NeedsNegOutR)
- return SDValue();
+ assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
+ "Valid conjunction/disjunction tree");
// Order the side where we need to negate the output flags to RHS so it
// gets emitted first.
if (NeedsNegOutL)
@@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
// through if we are already in a PushNegate case, otherwise we can negate
// the "flags to test" afterwards.
AArch64CC::CondCode RHSCC;
- SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
- CCOp, Predicate, Depth+1);
- if (NegateOperands && !PushNegate)
+ SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+ CCOp, Predicate);
+ if (NegateOpsAndResult && !Negate)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
- // Emit LHS. We must push the negate through if we need to negate it.
- SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
- CmpR, RHSCC, Depth+1);
+ // Emit LHS. We may need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+ NegateOpsAndResult, CmpR,
+ RHSCC);
// If we transformed an OR to an AND then we have to negate the result
- // (or absorb a PushNegate resulting in a double negation).
- if (Opcode == ISD::OR && !PushNegate)
+ // (or absorb the Negate parameter).
+ if (NegateOpsAndResult && !Negate)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
+/// \see emitConjunctionDisjunctionTreeRec().
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC) {
+ bool CanNegate;
+ if (!isConjunctionDisjunctionTree(Val, CanNegate))
+ return SDValue();
+
+ return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
+ AArch64CC::AL);
+}
+
/// @}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+ SDValue &AArch64cc, SelectionDAG &DAG,
+ const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::getVectorVT(TruncVT, NumElts), Ops);
+ return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
@@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::aarch64_thread_pointer: {
+ case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
@@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::PreserveMost:
+ case CallingConv::CXX_FAST_TLS:
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
@@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- MemVT, false, false, false, 0);
+ MemVT);
InVals.push_back(ArgValue);
}
}
// varargs
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin()) {
// The AAPCS variadic function ABI is identical to the non-variadic
@@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
- AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+ FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
- StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+ StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
- SelectionDAG &DAG, SDLoc DL,
+ SelectionDAG &DAG,
+ const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false,
- false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
@@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
@@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
- if (i == 0 && isThisReturn) {
+ if (i == 0 && isThisReturn && EnableThisRetForwarding) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
@@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult(
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- bool isCalleeStructRet, bool isCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
@@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;
- const MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
@@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- if (IsTailCallConvention(CalleeCC) && CCMatch)
- return true;
- return false;
+ return IsTailCallConvention(CalleeCC) && CCMatch;
}
// Externally-defined functions with weak linkage should not be
@@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
+ LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
@@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
@@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ CCAssignFnForCall(CalleeCC, isVarArg),
+ CCAssignFnForCall(CallerCC, isVarArg)))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext());
- CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
-
- SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext());
- CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
-
- if (RVLocs1.size() != RVLocs2.size())
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
// Nothing more to check if the callee is taking no arguments
@@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- // If the stack arguments for this call would fit into our own save area then
- // the call can be made tail.
- return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
+
+ return true;
}
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
@@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
}
bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
- return CallCC == CallingConv::Fast;
+ return CallCC == CallingConv::Fast ||
+ CallCC == CallingConv::PreserveMost;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
@@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
- Callee, CallConv, IsVarArg, IsStructRet,
- MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+ Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
- NumBytes = RoundUpToAlignment(NumBytes, 16);
+ NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
@@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
- SDValue Store =
- DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
@@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
- uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
- ? RoundUpToAlignment(NumBytes, 16)
- : 0;
+ uint64_t CalleePopBytes =
+ DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
@@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
@@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
- if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
- assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
- "use of MO_CONSTPOOL only supported on small model");
- SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
- SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
- SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
- SDValue GlobalAddr = DAG.getLoad(
- PtrVT, DL, DAG.getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /*isVolatile=*/false,
- /*isNonTemporal=*/true,
- /*isInvariant=*/true, 8);
- if (GN->getOffset() != 0)
- return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
- DAG.getConstant(GN->getOffset(), DL, PtrVT));
- return GlobalAddr;
- }
-
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
@@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet =
DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), false,
- true, true, 8);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ /* Alignment = */ 8, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
-SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
+ const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(Chain);
- Ops.push_back(SymAddr);
-
- Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+ Chain =
+ DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
@@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
- SDValue FVal, SDLoc dl,
+ SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
@@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
- MachinePointerInfo(SV), false, false, 8));
+ MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
@@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
- MachinePointerInfo(SV, 8), false, false, 8));
+ MachinePointerInfo(SV, 8),
+ /* Alignment = */ 8));
}
// void *__vr_top at offset 16
@@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
- MachinePointerInfo(SV, 16), false, false, 8));
+ MachinePointerInfo(SV, 16),
+ /* Alignment = */ 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
- MemOps.push_back(DAG.getStore(Chain, DL,
- DAG.getConstant(-GPRSize, DL, MVT::i32),
- GROffsAddr, MachinePointerInfo(SV, 24), false,
- false, 4));
+ MemOps.push_back(DAG.getStore(
+ Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
+ MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
- MemOps.push_back(DAG.getStore(Chain, DL,
- DAG.getConstant(-FPRSize, DL, MVT::i32),
- VROffsAddr, MachinePointerInfo(SV, 28), false,
- false, 4));
+ MemOps.push_back(DAG.getStore(
+ Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
+ MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
@@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
unsigned Align = Op.getConstantOperandVal(3);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V),
- false, false, false, 0);
+ SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
if (Align > 8) {
@@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
- SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
- false, false, 0);
+ SDValue APStore =
+ DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
- SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue WideFP =
+ DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
@@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(Ops, DL);
}
- return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
- false, false, 0);
+ return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
@@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
+/// getEstimate - Return the appropriate estimate DAG for either the reciprocal
+/// or the reciprocal square root.
+static SDValue getEstimate(const AArch64Subtarget &ST,
+ const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
+ const SDValue &Operand, unsigned &ExtraSteps) {
+ if (!ST.hasNEON())
+ return SDValue();
+
+ EVT VT = Operand.getValueType();
+
+ std::string RecipOp;
+ RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt";
+ RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp;
+ RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f";
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ ExtraSteps = Recips.getRefinementSteps(RecipOp);
+ return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI, unsigned &ExtraSteps) const {
+ return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps);
+}
+
+SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
+ UseOneConst = true;
+ return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
+}
+
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
+const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+ // At this point, we have to lower this constraint to something else, so we
+ // lower it to an "r" or "w". However, by doing this we will force the result
+ // to be in register, while the X constraint is much more permissive.
+ //
+ // Although we are correct (we are free to emit anything, without
+ // constraints), we might break use cases that would expect us to be more
+ // efficient and emit something else.
+ if (!Subtarget->hasFPARMv8())
+ return "r";
+
+ if (ConstraintVT.isFloatingPoint())
+ return "w";
+
+ if (ConstraintVT.isVector() &&
+ (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
+ return "w";
+
+ return "r";
+}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
@@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
- // v0 - v31 are aliases of q0 - q31.
+ // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
- Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
- Res.second = &AArch64::FPR128RegClass;
+ if (VT != MVT::Other && VT.getSizeInBits() == 64) {
+ Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR64RegClass;
+ } else {
+ Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR128RegClass;
+ }
}
}
}
@@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
- else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(V.getOperand(1))) {
// A shuffle can only come from building a vector from various
- // elements of other vectors.
+ // elements of other vectors, provided their indices are constant.
return SDValue();
}
@@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF)
+ if (Entry.isUndef())
continue;
auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
@@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], &Mask[0]);
+ ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) {
+ const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
@@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
- if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+ if (V2.getNode()->isUndef()) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
// table registers.
// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
- // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- // &TBLMask[0], IndexLen));
+ // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
+ // IndexLen));
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
- DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
- V1Cst, V2Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
+ V2Cst, DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
@@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
- V1.getValueType().getSimpleVT())) {
+ if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
@@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
- } else if (V2->getOpcode() == ISD::UNDEF &&
- isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+ } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
@@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
- SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
- if (Concat.getNode())
+ if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
@@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (EnableAArch64SlrGeneration) {
- SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
- if (Res.getNode())
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
}
@@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op,
}
Ops.push_back(Lane);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
@@ -6217,7 +6311,7 @@ FailedModImm:
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
@@ -6273,7 +6367,7 @@ FailedModImm:
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
- SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -6328,7 +6422,7 @@ FailedModImm:
// value is already in an S or D register.
// Do not do this for UNDEF/LOAD nodes because we have better patterns
// for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
- if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD &&
+ if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
(ElemSize == 32 || ElemSize == 64)) {
unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
MachineSDNode *N =
@@ -6339,7 +6433,7 @@ FailedModImm:
}
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
@@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
- SDLoc dl, SelectionDAG &DAG) {
+ const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
@@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
const DataLayout &DL = I->getModule()->getDataLayout();
EVT VT = getValueType(DL, User->getOperand(0)->getType());
- if (isFMAFasterThanFMulAndFAdd(VT) &&
- isOperationLegalOrCustom(ISD::FMA, VT) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath))
- return false;
-
- return true;
+ return !(isFMAFasterThanFMulAndFAdd(VT) &&
+ isOperationLegalOrCustom(ISD::FMA, VT) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
- if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
- return true;
- return false;
+ // Avoid UB for INT64_MIN.
+ if (Immed == std::numeric_limits<int64_t>::min())
+ return false;
+ // Same encoding for add/sub, just flip the sign.
+ Immed = std::abs(Immed);
+ return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
}
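(Aside, not part of this diff: a standalone sketch of the test above, under an illustrative name. An ADD/SUB immediate is legal when its magnitude fits in 12 bits, optionally shifted left by 12; the INT64_MIN check exists because std::abs on that value is undefined behavior.)

    #include <cstdint>
    #include <cstdlib>
    static bool fitsAddSubImmediate(int64_t V) {
      if (V == INT64_MIN)   // std::abs(INT64_MIN) would overflow
        return false;
      V = std::abs(V);
      return (V >> 12) == 0 || ((V & 0xfff) == 0 && (V >> 24) == 0);
    }
    // fitsAddSubImmediate(4095)     -> true  (imm12)
    // fitsAddSubImmediate(0x7ff000) -> true  (imm12, LSL #12)
    // fitsAddSubImmediate(0x1001)   -> false (needs two instructions)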
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
- if (Immed < 0)
- Immed *= -1;
return isLegalAddImmediate(Immed);
}
@@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
- if (!AM.Scale || AM.Scale == 1 ||
- (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
- return true;
- return false;
+ return !AM.Scale || AM.Scale == 1 ||
+ (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
@@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return Shift < 3;
}
+/// Turn vector tests of the signbit in the form of:
+/// xor (sra X, elt_size(X)-1), -1
+/// into:
+/// cmge X, X, #0
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!Subtarget->hasNEON() || !VT.isVector())
+ return SDValue();
+
+ // There must be a shift right algebraic before the xor, and the xor must be a
+ // 'not' operation.
+ SDValue Shift = N->getOperand(0);
+ SDValue Ones = N->getOperand(1);
+ if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
+ !ISD::isBuildVectorAllOnes(Ones.getNode()))
+ return SDValue();
+
+ // The shift should be smearing the sign bit across each vector element.
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+ if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ return SDValue();
+
+ return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
+}
+
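(Aside, not part of this diff: a per-lane scalar model of why the fold is sound; names are illustrative, and >> is taken as an arithmetic shift, as on all mainstream compilers.)

    #include <cstdint>
    // "xor (sra x, 31), -1": smear the sign bit across the lane, then invert.
    static int32_t notOfSignSmear(int32_t X) { return ~(X >> 31); }
    // "cmge x, x, #0" per-lane result: all-ones when the lane is >= 0.
    static int32_t cmgeZero(int32_t X) { return X >= 0 ? -1 : 0; }
    // The two agree for every X, so the xor+shift pair collapses to CMGEz.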
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// performXorCombine - Attempts to handle integer ABS.
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
return performIntegerAbsCombine(N, DAG);
}
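(Aside, not part of this diff: a scalar model of the SUBS/CSEL abs that performIntegerAbsCombine targets, ignoring the INT32_MIN overflow corner case.)

    static int32_t absViaCsel(int32_t X) {
      int32_t Neg = 0 - X;      // subs: compute -x and set flags
      return X < 0 ? Neg : X;   // csel picks x or -x from the flags
    }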
@@ -7376,6 +7496,10 @@ SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N,0); // Lower SDIV as SDIV
+
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
@@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- APInt Value = C->getAPIntValue();
+ const APInt &Value = C->getAPIntValue();
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (Value.isNonNegative()) {
@@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), LN0->isVolatile(),
- LN0->isNonTemporal(), LN0->isInvariant(),
- LN0->getAlignment());
+ LN0->getPointerInfo(), LN0->getAlignment(),
+ LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
@@ -7562,12 +7685,14 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue Op = N->getOperand(0);
- if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+ Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
@@ -7604,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
- ResTy = MVT::v4i32;
+ ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
+ if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
+ "Illegal vector type after legalization");
+
SDLoc DL(N);
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
@@ -7801,25 +7932,49 @@ static SDValue tryCombineToBSL(SDNode *N,
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
- if (!EnableAArch64ExtrGeneration)
- return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
- SDValue Res = tryCombineToEXTR(N, DCI);
- if (Res.getNode())
+ if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
- Res = tryCombineToBSL(N, DCI);
- if (Res.getNode())
+ if (SDValue Res = tryCombineToBSL(N, DCI))
return Res;
return SDValue();
}
+static SDValue performSRLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
+ // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
+ // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() == ISD::BSWAP) {
+ SDLoc DL(N);
+ SDValue N1 = N->getOperand(1);
+ SDValue N00 = N0.getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ uint64_t ShiftAmt = C->getZExtValue();
+ if (VT == MVT::i32 && ShiftAmt == 16 &&
+ DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
+ return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+ if (VT == MVT::i64 && ShiftAmt == 32 &&
+ DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
+ return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+ }
+ }
+ return SDValue();
+}
+
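(Aside, not part of this diff: a scalar sketch of why the canonicalization is sound, using the GCC/Clang bswap builtin. When the high 16 bits of x are zero, the low 16 bits of bswap(x) are zero, so a 16-bit shift and a 16-bit rotate agree, and the ROTR form is the shape the REV16 selection patterns can match.)

    #include <cstdint>
    static uint32_t srlOfBswap(uint32_t X) {
      return __builtin_bswap32(X) >> 16;
    }
    static uint32_t rotrOfBswap(uint32_t X) {
      uint32_t B = __builtin_bswap32(X);
      return (B >> 16) | (B << 16);  // rotr #16
    }
    // For every X <= 0xffff: srlOfBswap(X) == rotrOfBswap(X).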
static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -8575,15 +8730,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
SDValue BasePtr = St->getBasePtr();
SDValue NewST1 =
DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+ St->getAlignment(), St->getMemOperand()->getFlags());
unsigned Offset = EltOffset;
while (--NumVecElts) {
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), Alignment);
+ St->getPointerInfo(), Alignment,
+ St->getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
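(Aside, not part of this diff: a scalar model of the rewrite performed by replaceSplatVectorStore, shown with an illustrative helper.)

    #include <cstdint>
    // A store of a splat vector becomes NumElts scalar stores of the same
    // value at successive element offsets, as built in the loop above.
    static void storeSplat(uint32_t *Base, uint32_t SplatVal,
                           unsigned NumElts) {
      for (unsigned i = 0; i != NumElts; ++i)
        Base[i] = SplatVal;
    }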
@@ -8603,9 +8758,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
- // Cyclone has bad performance on unaligned 16B stores when crossing line and
- // page boundaries. We want to split such stores.
- if (!Subtarget->isCyclone())
+ if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.
@@ -8647,12 +8800,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
- S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+ S->getAlignment(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
- S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
- S->getAlignment());
+ S->getPointerInfo(), S->getAlignment(),
+ S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
@@ -8741,9 +8894,10 @@ static SDValue performPostLD1Combine(SDNode *N,
LoadSDN->getMemOperand());
// Update the uses.
- SmallVector<SDValue, 2> NewResults;
- NewResults.push_back(SDValue(LD, 0)); // The result of load
- NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+ SDValue NewResults[] = {
+ SDValue(LD, 0), // The result of load
+ SDValue(UpdN.getNode(), 2) // Chain
+ };
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
@@ -8774,8 +8928,7 @@ static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
- if (Split.getNode())
+ if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
@@ -9215,10 +9368,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::Constant:
case ISD::TargetConstant: {
- if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
- 1LL << (width - 1))
- return true;
- return false;
+ return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+ 1LL << (width - 1);
}
}
@@ -9286,14 +9437,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
// isEquivalentMaskless() is the code for testing if the AND can be removed
// factored out of the DAG recognition as the DAG can take several forms.
-static
-bool isEquivalentMaskless(unsigned CC, unsigned width,
- ISD::LoadExtType ExtType, signed AddConstant,
- signed CompConstant) {
+static bool isEquivalentMaskless(unsigned CC, unsigned width,
+ ISD::LoadExtType ExtType, int AddConstant,
+ int CompConstant) {
// By being careful about our equations and only writing the in term
// symbolic values and well known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
- signed MaxUInt = (1 << width);
+ int MaxUInt = (1 << width);
// For the purposes of these comparisons sign extending the type is
// equivalent to zero extending the add and displacing it by half the integer
@@ -9441,8 +9591,7 @@ SDValue performCONDCombine(SDNode *N,
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
- if (NV.getNode())
+ if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
@@ -9678,7 +9827,7 @@ static SDValue performSelectCombine(SDNode *N,
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
- SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+ SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
@@ -9711,11 +9860,13 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performIntToFpCombine(N, DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
- return performFpToIntCombine(N, DAG, Subtarget);
+ return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
+ case ISD::SRL:
+ return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicCombine(N, DCI, Subtarget);
case ISD::ANY_EXTEND:
@@ -9829,10 +9980,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!CI->isTailCall())
- return false;
-
- return true;
+ return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
@@ -9935,6 +10083,38 @@ static void ReplaceReductionResults(SDNode *N,
Results.push_back(SplitVal);
}
+static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
+ SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
+ DAG.getNode(ISD::SRL, DL, MVT::i128, N,
+ DAG.getConstant(64, DL, MVT::i64)));
+ return std::make_pair(Lo, Hi);
+}
+
+static void ReplaceCMP_SWAP_128Results(SDNode *N,
+ SmallVectorImpl<SDValue> & Results,
+ SelectionDAG &DAG) {
+ assert(N->getValueType(0) == MVT::i128 &&
+ "AtomicCmpSwap on types less than 128 should be legal");
+ auto Desired = splitInt128(N->getOperand(2), DAG);
+ auto New = splitInt128(N->getOperand(3), DAG);
+ SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
+ New.first, New.second, N->getOperand(0)};
+ SDNode *CmpSwap = DAG.getMachineNode(
+ AArch64::CMP_SWAP_128, SDLoc(N),
+ DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ Results.push_back(SDValue(CmpSwap, 0));
+ Results.push_back(SDValue(CmpSwap, 1));
+ Results.push_back(SDValue(CmpSwap, 3));
+}
+
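(Aside, not part of this diff: the operation being legalized here is a 128-bit compare-and-swap. A hedged source-level equivalent, assuming a compiler with __int128 and 16-byte atomic support; splitInt128 above corresponds to Lo = (uint64_t)V and Hi = (uint64_t)(V >> 64).)

    #include <atomic>
    static bool cas128(std::atomic<unsigned __int128> &Obj,
                       unsigned __int128 &Expected,
                       unsigned __int128 Desired) {
      return Obj.compare_exchange_strong(Expected, Desired);
    }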
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -9966,11 +10146,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceCMP_SWAP_128Results(N, Results, DAG);
+ return;
}
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
- return true;
+ if (!Subtarget->isTargetAndroid() && !Subtarget->isTargetOpenBSD())
+ return true;
+ return TargetLowering::useLoadStackGuardNode();
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10017,14 +10202,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
- return true;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement cmpxchg without spilling. If the address being exchanged is also
+ // on the stack and close enough to the spill slot, this can lead to a
+ // situation where the monitor always gets cleared and the atomic operation
+ // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+ return getTargetMachine().getOptLevel() != 0;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire = isAtLeastAcquire(Ord);
+ bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
@@ -10066,7 +10256,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease = isAtLeastRelease(Ord);
+ bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
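(Aside, not part of this diff: isAtLeastAcquire/isAtLeastRelease were renamed upstream to isAcquireOrStronger/isReleaseOrStronger with the same meaning. A sketch of the predicates using the C++ memory-order enum, ignoring consume ordering:)

    #include <atomic>
    static bool acquireOrStronger(std::memory_order O) {
      return O == std::memory_order_acquire ||
             O == std::memory_order_acq_rel || O == std::memory_order_seq_cst;
    }
    static bool releaseOrStronger(std::memory_order O) {
      return O == std::memory_order_release ||
             O == std::memory_order_acq_rel || O == std::memory_order_seq_cst;
    }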
@@ -10104,6 +10294,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
return false;
}
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getIRStackGuard(IRB);
+
+ // Android provides a fixed TLS slot for the stack cookie. See the definition
+ // of TLS_SLOT_STACK_GUARD in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ const unsigned TlsOffset = 0x28;
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+ return IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
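(Aside, not part of this diff: a sketch of the address this IR computes on Android/AArch64, where llvm.thread.pointer reads tpidr_el0. The helper name is illustrative and the inline asm is GCC/Clang specific.)

    #include <cstdint>
    static void **stackGuardSlot() {
      char *TP;
      asm("mrs %0, tpidr_el0" : "=r"(TP));          // thread pointer
      return reinterpret_cast<void **>(TP + 0x28);  // TLS_SLOT_STACK_GUARD
    }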
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (!Subtarget->isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
@@ -10114,7 +10320,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons
const unsigned TlsOffset = 0x48;
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
@@ -10166,3 +10372,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
.addReg(NewVR);
}
}
+
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // Integer division on AArch64 is expensive. However, when aggressively
+ // optimizing for code size, we prefer to use a div instruction, as it is
+ // usually smaller than the alternative sequence.
+ // The exception to this is vector division. Since AArch64 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize =
+ Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ return OptSize && !VT.isVector();
+}
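(Aside, not part of this diff: the practical effect, sketched under assumed codegen. With minsize, a scalar division by a constant stays a single sdiv instead of the longer magic-number multiply sequence; a vector division is still expanded, since leaving it would scalarize into per-lane sdivs plus lane moves.)

    // Built with -Oz, per the hook above:
    int scalarDiv(int X) { return X / 1000; }  // expected: one sdiv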
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h b/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 151133b2f32..91905b2c383 100644
--- a/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -19,6 +19,7 @@
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -32,38 +33,64 @@ class StringRef;
class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
- enum ARMProcFamilyEnum {
+public:
+ enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
CortexA57,
+ CortexA72,
+ CortexA73,
Cyclone,
- ExynosM1
+ ExynosM1,
+ Kryo,
+ Vulcan
};
+protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
- ARMProcFamilyEnum ARMProcFamily;
+ ARMProcFamilyEnum ARMProcFamily = Others;
- bool HasV8_1aOps;
- bool HasV8_2aOps;
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
- bool HasFPARMv8;
- bool HasNEON;
- bool HasCrypto;
- bool HasCRC;
- bool HasPerfMon;
- bool HasFullFP16;
- bool HasSPE;
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
+ bool HasCrypto = false;
+ bool HasCRC = false;
+ bool HasRAS = false;
+ bool HasPerfMon = false;
+ bool HasFullFP16 = false;
+ bool HasSPE = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
- bool HasZeroCycleRegMove;
+ bool HasZeroCycleRegMove = false;
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
- bool HasZeroCycleZeroing;
+ bool HasZeroCycleZeroing = false;
// StrictAlign - Disallow unaligned memory accesses.
- bool StrictAlign;
+ bool StrictAlign = false;
+ bool MergeNarrowLoads = false;
+ bool UseAA = false;
+ bool PredictableSelectIsExpensive = false;
+ bool BalanceFPOps = false;
+ bool CustomAsCheapAsMove = false;
+ bool UsePostRAScheduler = false;
+ bool Misaligned128StoreIsSlow = false;
+ bool AvoidQuadLdStPairs = false;
+ bool UseAlternateSExtLoadCVTF32Pattern = false;
+ bool HasMacroOpFusion = false;
+ bool DisableLatencySchedHeuristic = false;
+ bool UseRSqrt = false;
+ uint8_t MaxInterleaveFactor = 2;
+ uint8_t VectorInsertExtractBaseCost = 3;
+ uint16_t CacheLineSize = 0;
+ uint16_t PrefetchDistance = 0;
+ uint16_t MinPrefetchStride = 1;
+ unsigned MaxPrefetchIterationsAhead = UINT_MAX;
+ unsigned PrefFunctionAlignment = 0;
+ unsigned PrefLoopAlignment = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -80,12 +107,20 @@ protected:
AArch64InstrInfo InstrInfo;
AArch64SelectionDAGInfo TSInfo;
AArch64TargetLowering TLInfo;
+ /// Gather the accessor points to GlobalISel-related APIs.
+ /// This is used to avoid ifndefs spreading around while GISel is
+ /// an optional library.
+ std::unique_ptr<GISelAccessor> GISel;
+
private:
/// initializeSubtargetDependencies - Initializes using CPUString and the
/// passed in feature string so that we can use initializer lists for
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
+ /// Initialize properties based on the selected processor family.
+ void initializeProperties();
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
@@ -93,6 +128,11 @@ public:
const std::string &FS, const TargetMachine &TM,
bool LittleEndian);
+  /// This object will take ownership of \p GISelAccessor.
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
+ }
+
const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
@@ -106,10 +146,20 @@ public:
const AArch64RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
+ const CallLowering *getCallLowering() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isGeneric() || isCortexA53() || isCortexA57();
+ return UsePostRAScheduler;
+ }
+
+ /// Returns ARM processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const {
+ return ARMProcFamily;
}
bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -126,6 +176,33 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool hasRAS() const { return HasRAS; }
+ bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+ bool balanceFPOps() const { return BalanceFPOps; }
+ bool predictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
+ bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+ bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+ bool useAlternateSExtLoadCVTF32Pattern() const {
+ return UseAlternateSExtLoadCVTF32Pattern;
+ }
+ bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+ bool useRSqrt() const { return UseRSqrt; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+ unsigned getVectorInsertExtractBaseCost() const {
+ return VectorInsertExtractBaseCost;
+ }
+ unsigned getCacheLineSize() const { return CacheLineSize; }
+ unsigned getPrefetchDistance() const { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const {
+ return MaxPrefetchIterationsAhead;
+ }
+ unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
+ unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -139,6 +216,7 @@ public:
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetOpenBSD() const { return TargetTriple.isOSOpenBSD(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
@@ -146,13 +224,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
- bool isGeneric() const { return CPUString == "generic"; }
- bool isCyclone() const { return CPUString == "cyclone"; }
- bool isCortexA57() const { return CPUString == "cortex-a57"; }
- bool isCortexA53() const { return CPUString == "cortex-a53"; }
- bool isExynosM1() const { return CPUString == "exynos-m1"; }
-
- bool useAA() const override { return isCortexA53(); }
+ bool useAA() const override { return UseAA; }
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -174,8 +246,7 @@ public:
/// returns null.
const char *getBZeroEntry() const;
- void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
- MachineInstr *end,
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
bool enableEarlyIfConversion() const override;