author    | Patrick Wildt <patrick@cvs.openbsd.org> | 2017-01-14 20:19:40 +0000
committer | Patrick Wildt <patrick@cvs.openbsd.org> | 2017-01-14 20:19:40 +0000
commit    | f444bdcbea9aa072c5d8199d4a7b1288a44b40ac (patch)
tree      | 6f60ced3b255dbfe9ef5abcc6f8ab372457023a9 /gnu/llvm
parent    | 0bd56e6caa13367b1386362329946770bc35feff (diff)
Disable the Load Stack Guard for OpenBSD on AArch64. We don't use it
on any other platform and it causes a segfault in combination with our
IR Stack Guard.
"looks reasonable" kettenis@
"looks good to me" stefan@
Diffstat (limited to 'gnu/llvm')
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 981
-rw-r--r-- | gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h      | 123
2 files changed, 697 insertions, 407 deletions
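Most of the churn in AArch64ISelLowering.cpp below is surrounding upstream work rather than the stack-guard change itself: CPU-model checks such as isCyclone() and isCortexA57() are replaced by tuning accessors on AArch64Subtarget (isMisaligned128StoreSlow(), predictableSelectIsExpensive(), getPrefFunctionAlignment(), ...), which is also why AArch64Subtarget.h changes. Below is a minimal sketch of that pattern, with simplified member names and initialisation rather than the real class layout:

```cpp
// Illustration of the subtarget-tuning pattern visible in the diff: per-CPU
// properties are stored as feature fields on the subtarget, and the lowering
// code queries an accessor instead of testing the CPU model directly.
// Names and defaults are simplified for the sketch.
class AArch64SubtargetSketch {
  bool Misaligned128StoreIsSlow = false;      // e.g. set for Cyclone
  bool PredictableSelectIsExpensive = false;  // e.g. set for Cortex-A57
  unsigned PrefFunctionAlignment = 0;

public:
  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
  bool predictableSelectIsExpensive() const {
    return PredictableSelectIsExpensive;
  }
  unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
};
```

The displayed diff therefore includes this upstream refactoring as context, not just the one small functional change the log message describes.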
diff --git a/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 92cf1cd7197..13722c37c57 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/gnu/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,12 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -// Place holder until extr generation is tested fully. -static cl::opt<bool> -EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, - cl::desc("Allow AArch64 (or (shift)(shift))->extract"), - cl::init(true)); - static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), @@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +// Disabled for causing self-hosting failures once returned-attribute inference +// was enabled. +static cl::opt<bool> +EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden, + cl::desc("Directly forward this return"), + cl::init(false)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which AArch64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); - + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); @@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; + // Set required alignment. setMinFunctionAlignment(2); + // Set preferred alignments. 
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); + setPrefLoopAlignment(STI.getPrefLoopAlignment()); setHasExtractBitsInsn(true); @@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); + + setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); + // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. @@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // Prefer likely predicted branches to selects on out-of-order cores. - if (Subtarget->isCortexA57()) - PredictableSelectIsExpensive = true; + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } -void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { +void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i32); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i32); } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i64); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i64); } // Mark vector float intrinsics as expand. 
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); // But we do support custom-lowering for FCOPYSIGN. - setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes. 
if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); // [SU][MIN|MAX] are available for all NEON types apart from i64. - if (!VT.isFloatingPoint() && - VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) + if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); } } } @@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (Subtarget->requiresStrictAlign()) return false; - // FIXME: This is mostly true for Cyclone, but not necessarily others. if (Fast) { - // FIXME: Define an attribute for slow unaligned accesses instead of - // relying on the CPU type as a proxy. - // On Cyclone, unaligned 128-bit stores are slow. - *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. 
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; } return nullptr; } MachineBasicBlock * -AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, +AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: @@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned IfTrueReg = MI.getOperand(1).getReg(); + unsigned IfFalseReg = MI.getOperand(2).getReg(); + unsigned CondCode = MI.getOperand(3).getImm(); + bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, .addReg(IfFalseReg) .addMBB(MBB); - MI->eraseFromParent(); + MI.eraseFromParent(); return EndBB; } -MachineBasicBlock * -AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { +MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { default: #ifndef NDEBUG - MI->dump(); + MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); @@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC, } } +/// Convert a DAG fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. +static void changeFPCCToANDAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case ISD::SETONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case ISD::SETUEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. 
Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations @@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) { } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); - if (VT.isFloatingPoint()) + if (VT.isFloatingPoint()) { + assert(VT != MVT::f128); + if (VT == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + VT = MVT::f32; + } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. @@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, - SDValue Condition, unsigned NZCV, - SDLoc DL, SelectionDAG &DAG) { + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; - if (LHS.getValueType().isFloatingPoint()) + if (LHS.getValueType().isFloatingPoint()) { + assert(LHS.getValueType() != MVT::f128); + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); + } Opcode = AArch64ISD::FCCMP; - else if (RHS.getOpcode() == ISD::SUB) { + } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // See emitComparison() on why we can only do this for SETEQ and SETNE. - Opcode = AArch64ISD::CCMN; - RHS = RHS.getOperand(1); - } + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; + SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } @@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be /// brought into such a form. -static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { - CanPushNegate = true; + if (Val->getOperand(0).getValueType() == MVT::f128) + return false; + CanNegate = true; return true; } - // Protect against stack overflow. - if (Depth > 15) + // Protect against exponential runtime and stack overflow. 
+ if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + bool CanNegateL; + if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1)) return false; - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + bool CanNegateR; + if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1)) return false; - // We cannot push a negate through an AND operation (it would become an OR), - // we can however change a (not (or x y)) to (and (not x) (not y)) if we can - // push the negate through the x/y subtrees. - CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + + if (Opcode == ISD::OR) { + // For an OR expression we need to be able to negate at least one side or + // we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // We can however change a (not (or x y)) to (and (not x) (not y)) if we + // can negate the x and y subtrees. + CanNegate = CanNegateL && CanNegateR; + } else { + // If the operands are OR expressions then we finally need to negate their + // outputs, we can only do that for the operand with emitted last by + // negating OutCC, not for both operands. + bool NeedsNegOutL = O0->getOpcode() == ISD::OR; + bool NeedsNegOutR = O1->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return false; + // We cannot negate an AND operation (it would become an OR), + CanNegate = false; + } return true; } return false; @@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate /// for the comparisons in the current subtree; @p Depth limits the search /// depth to avoid stack overflow. -static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, - AArch64CC::CondCode &OutCC, bool PushNegate = false, - SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, - unsigned Depth = 0) { +static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, + AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { @@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); - if (PushNegate) + if (Negate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. @@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; - changeFPCCToAArch64CC(CC, OutCC, ExtraCC); - // Surpisingly some floating point conditions can't be tested with a - // single condition code. Construct an additional comparison in this case. - // See comment below on how we deal with OR conditions. + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. 
if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); - else { - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - // Note that we want the inverse of ExtraCC, so NZCV is not inversed. - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); - ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, - NZCV, DL, DAG); - } + else + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, + ExtraCC, DL, DAG); CCOp = ExtraCmp; - Predicate = AArch64CC::getInvertedCondCode(ExtraCC); - OutCC = AArch64CC::getInvertedCondCode(OutCC); + Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain - if (!CCOp.getNode()) + if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); - return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); - } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) - return SDValue(); - - assert((Opcode == ISD::OR || !PushNegate) - && "Can only push negate through OR operation"); + } + assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && + "Valid conjunction/disjunction tree"); // Check if both sides can be transformed. SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) - return SDValue(); - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) - return SDValue(); - // Do we need to negate our operands? - bool NegateOperands = Opcode == ISD::OR; + // In case of an OR we need to negate our operands and the result. + // (A v B) <=> not(not(A) ^ not(B)) + bool NegateOpsAndResult = Opcode == ISD::OR; // We can negate the results of all previous operations by inverting the - // predicate flags giving us a free negation for one side. For the other side - // we need to be able to push the negation to the leafs of the tree. - if (NegateOperands) { - if (!CanPushNegateL && !CanPushNegateR) - return SDValue(); - // Order the side where we can push the negate through to LHS. - if (!CanPushNegateL && CanPushNegateR) + // predicate flags giving us a free negation for one side. The other side + // must be negatable by itself. + if (NegateOpsAndResult) { + // See which side we can negate. + bool CanNegateL; + bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); + assert(isValidL && "Valid conjunction/disjunction tree"); + (void)isValidL; + +#ifndef NDEBUG + bool CanNegateR; + bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); + assert(isValidR && "Valid conjunction/disjunction tree"); + assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); +#endif + + // Order the side which we cannot negate to RHS so we can emit it first. 
+ if (!CanNegateL) std::swap(LHS, RHS); } else { bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; - bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; - if (NeedsNegOutL && NeedsNegOutR) - return SDValue(); + assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && + "Valid conjunction/disjunction tree"); // Order the side where we need to negate the output flags to RHS so it // gets emitted first. if (NeedsNegOutL) @@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, // through if we are already in a PushNegate case, otherwise we can negate // the "flags to test" afterwards. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, - CCOp, Predicate, Depth+1); - if (NegateOperands && !PushNegate) + SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, + CCOp, Predicate); + if (NegateOpsAndResult && !Negate) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - // Emit LHS. We must push the negate through if we need to negate it. - SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, - CmpR, RHSCC, Depth+1); + // Emit LHS. We may need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, + NegateOpsAndResult, CmpR, + RHSCC); // If we transformed an OR to and AND then we have to negate the result - // (or absorb a PushNegate resulting in a double negation). - if (Opcode == ISD::OR && !PushNegate) + // (or absorb the Negate parameter). + if (NegateOpsAndResult && !Negate) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. See @ref AArch64CCMP. +/// \see emitConjunctionDisjunctionTreeRec(). +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC) { + bool CanNegate; + if (!isConjunctionDisjunctionTree(Val, CanNegate)) + return SDValue(); + + return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), + AArch64CC::AL); +} + /// @} static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + SDValue &AArch64cc, SelectionDAG &DAG, + const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::getVectorVT(TruncVT, NumElts), Ops); + return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { @@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
- case Intrinsic::aarch64_thread_pointer: { + case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } @@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: + case CallingConv::PreserveMost: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; @@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - MemVT, false, false, false, 0); + MemVT); InVals.push_back(ArgValue); } } // varargs + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { if (!Subtarget->isTargetDarwin()) { // The AAPCS variadic function ABI is identical to the non-variadic @@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments( saveVarArgRegisters(CCInfo, DAG, DL, Chain); } - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); + StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. 
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, + SelectionDAG &DAG, + const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, - false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), - false, false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS @@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult( // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference - if (i == 0 && isThisReturn) { + if (i == 0 && isThisReturn && EnableThisRetForwarding) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); @@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult( bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { @@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; - const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; + return IsTailCallConvention(CalleeCC) && CCMatch; } // Externally-defined functions with weak linkage should not be @@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); + LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If @@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) @@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; } - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForCall(CalleeCC, isVarArg), + CCAssignFnForCall(CallerCC, isVarArg))) + return false; + // The callee has to preserve all registers the caller needs to preserve. 
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { - SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, - *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, - *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } } // Nothing more to check if the callee is taking no arguments @@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) + return false; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; + + return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, @@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, } bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; + return CallCC == CallingConv::Fast || + CallCC == CallingConv::PreserveMost; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, @@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); + NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we @@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } @@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; + uint64_t CalleePopBytes = + DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), @@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { + const SDLoc &DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; @@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } - if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "use of MO_CONSTPOOL only supported on small model"); - SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); - SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad( - PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*isVolatile=*/false, - /*isNonTemporal=*/true, - /*isInvariant=*/true, 8); - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), DL, PtrVT)); - return GlobalAddr; - } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( @@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i64, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, - true, true, 8); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 8, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. -SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, +SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, + const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 2> Ops; - Ops.push_back(Chain); - Ops.push_back(SymAddr); - - Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + Chain = + DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); @@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, - SDValue FVal, SDLoc dl, + SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. 
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, getPointerTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); + MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, @@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); + MachinePointerInfo(SV), /* Alignment = */ 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); @@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); + MachinePointerInfo(SV, 8), + /* Alignment = */ 8)); } // void *__vr_top at offset 16 @@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); + MachinePointerInfo(SV, 16), + /* Alignment = */ 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-GPRSize, DL, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, + MachinePointerInfo(SV, 24), /* Alignment = */ 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-FPRSize, DL, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, + MachinePointerInfo(SV, 28), /* Alignment = */ 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), - false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); if (Align > 8) { @@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); + SDValue APStore = + DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); + SDValue WideFP = + DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. 
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); @@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. @@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +/// getEstimate - Return the appropriate estimate DAG for either the reciprocal +/// or the reciprocal square root. +static SDValue getEstimate(const AArch64Subtarget &ST, + const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, + const SDValue &Operand, unsigned &ExtraSteps) { + if (!ST.hasNEON()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + std::string RecipOp; + RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; + RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; + RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + ExtraSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { + return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { + UseOneConst = true; + return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. +const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { + // At this point, we have to lower this constraint to something else, so we + // lower it to an "r" or "w". However, by doing this we will force the result + // to be in register, while the X constraint is much more permissive. 
+ // + // Although we are correct (we are free to emit anything, without + // constraints), we might break use cases that would expect us to be more + // efficient and emit something else. + if (!Subtarget->hasFPARMv8()) + return "r"; + + if (ConstraintVT.isFloatingPoint()) + return "w"; + + if (ConstraintVT.isVector() && + (ConstraintVT.getSizeInBits() == 64 || + ConstraintVT.getSizeInBits() == 128)) + return "w"; + + return "r"; +} /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. @@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. + // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. - Res.first = AArch64::FPR128RegClass.getRegister(RegNo); - Res.second = &AArch64::FPR128RegClass; + if (VT != MVT::Other && VT.getSizeInBits() == 64) { + Res.first = AArch64::FPR64RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR64RegClass; + } else { + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } } } } @@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(V.getOperand(1))) { // A shuffle can only come from building a vector from various - // elements of other vectors. + // elements of other vectors, provided their indices are constant. return SDValue(); } @@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) + if (Entry.isUndef()) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); @@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], &Mask[0]); + ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { + const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); @@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); + // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], + // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), - V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, + V2Cst, DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. 
if (Lane == -1) @@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { + } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); @@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) + if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; @@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } @@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op, } Ops.push_back(Lane); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, @@ -6217,7 +6311,7 @@ FailedModImm: SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; @@ -6273,7 +6367,7 @@ FailedModImm: for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -6328,7 +6422,7 @@ FailedModImm: // value is already in an S or D register. // Do not do this for UNDEF/LOAD nodes because we have better patterns // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = @@ -6339,7 +6433,7 @@ FailedModImm: } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); @@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); @@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); - if (isFMAFasterThanFMulAndFAdd(VT) && - isOperationLegalOrCustom(ISD::FMA, VT) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) - return false; - - return true; + return !(isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; + // Avoid UB for INT64_MIN. + if (Immed == std::numeric_limits<int64_t>::min()) + return false; + // Same encoding for add/sub, just flip the sign. + Immed = std::abs(Immed); + return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; return isLegalAddImmediate(Immed); } @@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; + return !AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, @@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return Shift < 3; } +/// Turn vector tests of the signbit in the form of: +/// xor (sra X, elt_size(X)-1), -1 +/// into: +/// cmge X, X, #0 +static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!Subtarget->hasNEON() || !VT.isVector()) + return SDValue(); + + // There must be a shift right algebraic before the xor, and the xor must be a + // 'not' operation. + SDValue Shift = N->getOperand(0); + SDValue Ones = N->getOperand(1); + if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || + !ISD::isBuildVectorAllOnes(Ones.getNode())) + return SDValue(); + + // The shift should be smearing the sign bit across each vector element. 
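The foldVectorXorShiftIntoCmp combine introduced above rests on a per-lane identity: an arithmetic shift right by (element size - 1) smears the sign bit across the element, so the bitwise NOT of that smear is all-ones exactly when the element is non-negative, which is what CMGE against zero produces. A minimal standalone check of the 32-bit case (plain C++, not SelectionDAG code; it relies on arithmetic right shift of signed values, which C++20 guarantees and AArch64 compilers implement):

#include <cassert>
#include <cstdint>

// Per 32-bit lane: ~(x >> 31) == (x >= 0 ? 0xFFFFFFFF : 0),
// i.e. xor(sra(x, 31), all-ones) is exactly the cmge-with-zero lane result.
uint32_t notOfSignSmear(int32_t x) { return ~static_cast<uint32_t>(x >> 31); }
uint32_t cmgez(int32_t x) { return x >= 0 ? 0xFFFFFFFFu : 0u; }

int main() {
  for (int32_t x : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX})
    assert(notOfSignSmear(x) == cmgez(x));
  return 0;
}

The one-use, VASHR and build-vector-all-ones checks in the combine simply confirm that the xor really is a 'not' of that particular shift before the node is rewritten.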
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); + if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + return SDValue(); + + return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); +} + // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// performXorCombine - Attempts to handle integer ABS. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) + return Cmp; + return performIntegerAbsCombine(N, DAG); } @@ -7376,6 +7496,10 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || @@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); + const APInt &Value = C->getAPIntValue(); EVT VT = N->getValueType(0); SDLoc DL(N); if (Value.isNonNegative()) { @@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); + LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. @@ -7562,12 +7685,14 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -7604,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + + assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && + "Illegal vector type after legalization"); + SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs @@ -7801,25 +7932,49 @@ static SDValue tryCombineToBSL(SDNode *N, static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableAArch64ExtrGeneration) - return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } +static SDValue performSRLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the + // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) + // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() == ISD::BSWAP) { + SDLoc DL(N); + SDValue N1 = N->getOperand(1); + SDValue N00 = N0.getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + uint64_t ShiftAmt = C->getZExtValue(); + if (VT == MVT::i32 && ShiftAmt == 16 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + if (VT == MVT::i64 && ShiftAmt == 32 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + } + } + return SDValue(); +} + static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -8575,15 +8730,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + St->getAlignment(), St->getMemOperand()->getFlags()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); + St->getPointerInfo(), Alignment, + St->getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; @@ -8603,9 +8758,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundaries. We want to split such stores. - if (!Subtarget->isCyclone()) + if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. 
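performSRLCombine above turns (srl (bswap x), 16) into (rotr (bswap x), 16) when the high 16 bits of x are known zero, and likewise the 64-bit case with a shift of 32. The two are equal in that situation because the bits the rotate would wrap around are all zero, and the rotate form is presumably easier for later patterns to match as a REV-style instruction. The identity itself is easy to check in plain C++ (bswap32/rotr32 are local helpers; std::byteswap and std::rotr exist in newer standard libraries but are avoided here to keep the sketch self-contained):

#include <cassert>
#include <cstdint>

uint32_t bswap32(uint32_t x) {
  return (x >> 24) | ((x >> 8) & 0xFF00u) | ((x << 8) & 0xFF0000u) | (x << 24);
}
// Rotate right; r must be in (0, 32) to avoid an undefined 32-bit shift.
uint32_t rotr32(uint32_t x, unsigned r) { return (x >> r) | (x << (32 - r)); }

int main() {
  // With the high 16 bits of x zero, the low 16 bits of bswap32(x) are zero
  // too, so shifting right by 16 and rotating right by 16 give the same value.
  for (uint32_t x : {0x0000ABCDu, 0x00000001u, 0x0000FFFFu, 0x00000000u})
    assert((bswap32(x) >> 16) == rotr32(bswap32(x), 16));
  return 0;
}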
@@ -8647,12 +8800,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + S->getAlignment(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); + S->getPointerInfo(), S->getAlignment(), + S->getMemOperand()->getFlags()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and @@ -8741,9 +8894,10 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - SmallVector<SDValue, 2> NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + SDValue NewResults[] = { + SDValue(LD, 0), // The result of load + SDValue(UpdN.getNode(), 2) // Chain + }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register @@ -8774,8 +8928,7 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - SDValue Split = split16BStores(N, DCI, DAG, Subtarget); - if (Split.getNode()) + if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && @@ -9215,10 +9368,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { } case ISD::Constant: case ISD::TargetConstant: { - if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < - 1LL << (width - 1)) - return true; - return false; + return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < + 1LL << (width - 1); } } @@ -9286,14 +9437,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. -static -bool isEquivalentMaskless(unsigned CC, unsigned width, - ISD::LoadExtType ExtType, signed AddConstant, - signed CompConstant) { +static bool isEquivalentMaskless(unsigned CC, unsigned width, + ISD::LoadExtType ExtType, int AddConstant, + int CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. - signed MaxUInt = (1 << width); + int MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer @@ -9441,8 +9591,7 @@ SDValue performCONDCombine(SDNode *N, static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); - if (NV.getNode()) + if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); @@ -9678,7 +9827,7 @@ static SDValue performSelectCombine(SDNode *N, // Now duplicate the comparison mask we want across all other lanes. 
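The checkValueWidth change above is a pure simplification (return the condition instead of branching to return true or false), but the condition is worth spelling out: for a constant operand it asks whether the value's magnitude fits below 2^(width-1). A small model of just that constant case (fitsInWidth is my name for it, not an LLVM API):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Rough model of the Constant/TargetConstant case in checkValueWidth: the
// value is considered narrow enough when |v| < 2^(width-1).
bool fitsInWidth(int64_t v, unsigned width) {
  return std::abs(v) < (1LL << (width - 1));
}

int main() {
  assert(fitsInWidth(127, 8));     // inside the 8-bit signed range
  assert(!fitsInWidth(128, 8));    // one past it
  assert(fitsInWidth(-127, 8));    // note that -128 is rejected by this check
  assert(!fitsInWidth(40000, 16)); // outside the 16-bit signed range
  return 0;
}

The isEquivalentMaskless change alongside it is likewise cosmetic: the 'signed' spellings become plain int with no change in behaviour.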
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); @@ -9711,11 +9860,13 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - return performFpToIntCombine(N, DAG, Subtarget); + return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); + case ISD::SRL: + return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -9829,10 +9980,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; + return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, @@ -9935,6 +10083,38 @@ static void ReplaceReductionResults(SDNode *N, Results.push_back(SplitVal); } +static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); + SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, + DAG.getNode(ISD::SRL, DL, MVT::i128, N, + DAG.getConstant(64, DL, MVT::i64))); + return std::make_pair(Lo, Hi); +} + +static void ReplaceCMP_SWAP_128Results(SDNode *N, + SmallVectorImpl<SDValue> & Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i128 && + "AtomicCmpSwap on types less than 128 should be legal"); + auto Desired = splitInt128(N->getOperand(2), DAG); + auto New = splitInt128(N->getOperand(3), DAG); + SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, + New.first, New.second, N->getOperand(0)}; + SDNode *CmpSwap = DAG.getMachineNode( + AArch64::CMP_SWAP_128, SDLoc(N), + DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + Results.push_back(SDValue(CmpSwap, 0)); + Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(SDValue(CmpSwap, 3)); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9966,11 +10146,16 @@ void AArch64TargetLowering::ReplaceNodeResults( assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. 
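ReplaceCMP_SWAP_128Results feeds the CMP_SWAP_128 pseudo with the desired and new values split into 64-bit halves, and splitInt128 above is simply "truncate for the low half, shift right by 64 then truncate for the high half". The same split at the value level, in plain C++ (assuming a compiler with the __int128 extension, as GCC and Clang provide on 64-bit targets; splitU128 is my name):

#include <cassert>
#include <cstdint>
#include <utility>

// Value-level mirror of splitInt128: low 64 bits by truncation, high 64 bits
// by a shift right of 64 followed by truncation.
std::pair<uint64_t, uint64_t> splitU128(unsigned __int128 v) {
  const uint64_t lo = static_cast<uint64_t>(v);
  const uint64_t hi = static_cast<uint64_t>(v >> 64);
  return {lo, hi};
}

int main() {
  const unsigned __int128 v =
      (static_cast<unsigned __int128>(0x0123456789ABCDEFull) << 64) |
      0xFEDCBA9876543210ull;
  const auto [lo, hi] = splitU128(v);
  assert(lo == 0xFEDCBA9876543210ull);
  assert(hi == 0x0123456789ABCDEFull);
  return 0;
}

Results then carries the two i64 halves plus the chain taken from the machine node, which is what the i128 expansion of the original ATOMIC_CMP_SWAP result expects.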
return; + case ISD::ATOMIC_CMP_SWAP: + ReplaceCMP_SWAP_128Results(N, Results, DAG); + return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { - return true; + if (!Subtarget->isTargetAndroid() && !Subtarget->isTargetOpenBSD()) + return true; + return TargetLowering::useLoadStackGuardNode(); } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10017,14 +10202,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { - return true; + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement cmpxchg without spilling. If the address being exchanged is also + // on the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. + return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); - bool IsAcquire = isAtLeastAcquire(Ord); + bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a @@ -10066,7 +10256,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = isAtLeastRelease(Ord); + bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form @@ -10104,6 +10294,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getIRStackGuard(IRB); + + // Android provides a fixed TLS slot for the stack cookie. 
See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x28; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); @@ -10114,7 +10320,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons const unsigned TlsOffset = 0x48; Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); @@ -10166,3 +10372,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on AArch64 is expensive. However, when aggressively + // optimizing for code size, we prefer to use a div instruction, as it is + // usually smaller than the alternative sequence. + // The exception to this is vector division. Since AArch64 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. + bool OptSize = + Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h b/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h index 151133b2f32..91905b2c383 100644 --- a/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/gnu/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -32,38 +33,64 @@ class StringRef; class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum { +public: + enum ARMProcFamilyEnum : uint8_t { Others, CortexA35, CortexA53, CortexA57, + CortexA72, + CortexA73, Cyclone, - ExynosM1 + ExynosM1, + Kryo, + Vulcan }; +protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; + ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_1aOps; - bool HasV8_2aOps; + bool HasV8_1aOps = false; + bool HasV8_2aOps = false; - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - bool HasPerfMon; - bool HasFullFP16; - bool HasSPE; + bool HasFPARMv8 = false; + bool HasNEON = false; + bool HasCrypto = false; + bool HasCRC = false; + bool HasRAS = false; + bool HasPerfMon = false; + bool HasFullFP16 = false; + bool HasSPE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
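The new isIntDivCheap hook above (minsize functions keep the scalar divide, everything else gets the expansion) is what the BuildSDIVPow2 change consults before bailing out early. For reference, this is roughly the kind of branch-free sequence a signed divide by a power of two is otherwise turned into: bias a negative dividend by 2^k - 1 so the arithmetic shift rounds toward zero the way C division does. sdivPow2 below is an illustration of that idea, not the exact AArch64 cmp/csel/asr sequence:

#include <cassert>
#include <cstdint>

int32_t sdivPow2(int32_t x, unsigned k) {
  const uint32_t sign = static_cast<uint32_t>(x >> 31);              // all ones if x < 0
  const int32_t biased = x + static_cast<int32_t>(sign >> (32 - k)); // + (2^k - 1) when negative
  return biased >> k;                                                // arithmetic shift
}

int main() {
  for (int32_t x : {-9, -8, -7, -1, 0, 1, 7, 8, 9})
    assert(sdivPow2(x, 2) == x / 4);
  return 0;
}

At -Oz that multi-instruction sequence is usually larger than a single sdiv, which is the size argument made in the isIntDivCheap comment; vector types are excluded because, with no vector divide available, the scalarized division would lose on size as well.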
- bool HasZeroCycleRegMove; + bool HasZeroCycleRegMove = false; // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; + bool HasZeroCycleZeroing = false; // StrictAlign - Disallow unaligned memory accesses. - bool StrictAlign; + bool StrictAlign = false; + bool MergeNarrowLoads = false; + bool UseAA = false; + bool PredictableSelectIsExpensive = false; + bool BalanceFPOps = false; + bool CustomAsCheapAsMove = false; + bool UsePostRAScheduler = false; + bool Misaligned128StoreIsSlow = false; + bool AvoidQuadLdStPairs = false; + bool UseAlternateSExtLoadCVTF32Pattern = false; + bool HasMacroOpFusion = false; + bool DisableLatencySchedHeuristic = false; + bool UseRSqrt = false; + uint8_t MaxInterleaveFactor = 2; + uint8_t VectorInsertExtractBaseCost = 3; + uint16_t CacheLineSize = 0; + uint16_t PrefetchDistance = 0; + uint16_t MinPrefetchStride = 1; + unsigned MaxPrefetchIterationsAhead = UINT_MAX; + unsigned PrefFunctionAlignment = 0; + unsigned PrefLoopAlignment = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -80,12 +107,20 @@ protected: AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; AArch64TargetLowering TLInfo; + /// Gather the accessor points to GlobalISel-related APIs. + /// This is used to avoid ifndefs spreading around while GISel is + /// an optional library. + std::unique_ptr<GISelAccessor> GISel; + private: /// initializeSubtargetDependencies - Initializes using CPUString and the /// passed in feature string so that we can use initializer lists for /// subtarget initialization. AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); + /// Initialize properties based on the selected processor family. + void initializeProperties(); + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -93,6 +128,11 @@ public: const std::string &FS, const TargetMachine &TM, bool LittleEndian); + /// This object will take onwership of \p GISelAccessor. + void setGISelAccessor(GISelAccessor &GISel) { + this->GISel.reset(&GISel); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -106,10 +146,20 @@ public: const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isGeneric() || isCortexA53() || isCortexA57(); + return UsePostRAScheduler; + } + + /// Returns ARM processor family. + /// Avoid this function! CPU specifics should be kept local to this class + /// and preferably modeled with SubtargetFeatures or properties in + /// initializeProperties(). 
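The subtarget header now gives every tuning knob an in-class default and, per the comment above, prefers properties filled in by initializeProperties() over CPU-string predicates such as the removed isCortexA53()/isCyclone(). A stripped-down sketch of that pattern (names and the per-family settings are illustrative, inferred only from the old string-based checks visible in this diff, not a copy of the real initializeProperties()):

#include <cstdint>

enum class ProcFamily : uint8_t { Others, CortexA53, CortexA57, Cyclone };

// Members carry defaults via in-class initializers; one routine overrides
// them per processor family, and callers query the property instead of
// comparing CPU strings.
struct SubtargetProps {
  ProcFamily Family = ProcFamily::Others;
  bool UsePostRAScheduler = false;       // was gated on isCortexA53()/isCortexA57() etc.
  bool Misaligned128StoreIsSlow = false; // was isCyclone() in split16BStores
  uint8_t MaxInterleaveFactor = 2;
  uint16_t CacheLineSize = 0;

  void initializeProperties() {
    switch (Family) {
    case ProcFamily::CortexA53:
    case ProcFamily::CortexA57:
      UsePostRAScheduler = true;
      break;
    case ProcFamily::Cyclone:
      Misaligned128StoreIsSlow = true;
      break;
    case ProcFamily::Others:
      break;
    }
  }
};

int main() {
  SubtargetProps P;
  P.Family = ProcFamily::Cyclone;
  P.initializeProperties();
  return P.Misaligned128StoreIsSlow ? 0 : 1; // the split16BStores-style query
}

The payoff shows up in the .cpp changes: split16BStores asks isMisaligned128StoreSlow() and enablePostRAScheduler returns UsePostRAScheduler, so supporting a new core means setting its properties in one place rather than adding string compares throughout the backend.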
+ ARMProcFamilyEnum getProcFamily() const { + return ARMProcFamily; } bool hasV8_1aOps() const { return HasV8_1aOps; } @@ -126,6 +176,33 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasRAS() const { return HasRAS; } + bool mergeNarrowLoads() const { return MergeNarrowLoads; } + bool balanceFPOps() const { return BalanceFPOps; } + bool predictableSelectIsExpensive() const { + return PredictableSelectIsExpensive; + } + bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } + bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } + bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; } + bool useAlternateSExtLoadCVTF32Pattern() const { + return UseAlternateSExtLoadCVTF32Pattern; + } + bool hasMacroOpFusion() const { return HasMacroOpFusion; } + bool useRSqrt() const { return UseRSqrt; } + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } + unsigned getVectorInsertExtractBaseCost() const { + return VectorInsertExtractBaseCost; + } + unsigned getCacheLineSize() const { return CacheLineSize; } + unsigned getPrefetchDistance() const { return PrefetchDistance; } + unsigned getMinPrefetchStride() const { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const { + return MaxPrefetchIterationsAhead; + } + unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } + unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; @@ -139,6 +216,7 @@ public: bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetOpenBSD() const { return TargetTriple.isOSOpenBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } @@ -146,13 +224,7 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isGeneric() const { return CPUString == "generic"; } - bool isCyclone() const { return CPUString == "cyclone"; } - bool isCortexA57() const { return CPUString == "cortex-a57"; } - bool isCortexA53() const { return CPUString == "cortex-a53"; } - bool isExynosM1() const { return CPUString == "exynos-m1"; } - - bool useAA() const override { return isCortexA53(); } + bool useAA() const override { return UseAA; } /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -174,8 +246,7 @@ public: /// returns null. const char *getBZeroEntry() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; bool enableEarlyIfConversion() const override; |
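supportsAddressTopByteIgnored(), declared above, keys on TBI: AArch64 can be configured so that bits [63:56] of a virtual address are ignored by address translation, letting software stash a tag in the top byte. The snippet below only illustrates that bit layout; it never dereferences a tagged pointer, so it runs anywhere, and the helpers are hypothetical, not LLVM or libc APIs:

#include <cassert>
#include <cstdint>

constexpr uint64_t kAddrMask = 0x00FFFFFFFFFFFFFFull; // bits [55:0]

uint64_t setTag(uint64_t addr, uint8_t tag) {
  return (addr & kAddrMask) | (static_cast<uint64_t>(tag) << 56);
}
uint8_t getTag(uint64_t addr) { return static_cast<uint8_t>(addr >> 56); }
uint64_t stripTag(uint64_t addr) { return addr & kAddrMask; }

int main() {
  const uint64_t p = 0x0000007fdeadbeefull;
  const uint64_t tagged = setTag(p, 0x2A);
  assert(getTag(tagged) == 0x2A);
  assert(stripTag(tagged) == p);
  return 0;
}

Whether memory accesses through such tagged pointers actually work depends on both the CPU and the OS enabling TBI, which is the pair of conditions the hook's comment describes; the performSTORECombine change earlier consults the same hook.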